Compare commits

..

58 Commits

Author SHA1 Message Date
Леонид Юрьев (Leonid Yuriev)
9d7c5243d2 mdbx: update ChangeLog. 2025-11-12 18:10:35 +03:00
Леонид Юрьев (Leonid Yuriev)
5b740913d0 mdbx++: add missing std::hash<mdbx::buffer<>>. 2025-11-12 17:54:57 +03:00
Леонид Юрьев (Leonid Yuriev)
2f33449f2a mdbx-windows: use timeouts instead of retries for file locking. 2025-11-12 17:54:57 +03:00
Leonid Yuriev
aac890314c mdbx-windows: refine assertion handling for debug builds. 2025-11-12 17:54:40 +03:00
Leonid Yuriev
cfb9a55ebc mdbx: fix tls-dtor in case library unloading with an incomplete initialized env instance. 2025-11-10 21:38:06 +03:00
Леонид Юрьев (Leonid Yuriev)
cdbf2ed856 mdbx: update ChangeLog. 2025-11-08 00:25:04 +03:00
Леонид Юрьев (Leonid Yuriev)
f2f2cc3b40 mdbx: add workaround for ext4 fast-commit bug. 2025-11-07 21:17:18 +03:00
Леонид Юрьев (Leonid Yuriev)
3f8dad1ede mdbx-test: minor fix jitter test internals. 2025-11-07 21:11:21 +03:00
Леонид Юрьев (Leonid Yuriev)
a17e041830 mdbx: refactoring fetch/refresh/create/open tables and DBI-handles.
The (rare, quirky) scenario of recreating a previously opened and used table/DBI-handle
after it has been deleted by another process is now supported.
2025-11-06 10:26:28 +03:00
Леонид Юрьев (Leonid Yuriev)
207fc11d76 mdbx: update ChangeLog. 2025-11-05 14:51:44 +03:00
Леонид Юрьев (Leonid Yuriev)
3813333b28 mdbx: clean reader locktable by rthc_drown() only when no inprocess-neighbor. 2025-11-05 14:51:44 +03:00
Леонид Юрьев (Leonid Yuriev)
7628369819 mdbx: fix dummy coverity-scan warning. 2025-11-05 14:51:44 +03:00
Леонид Юрьев (Leonid Yuriev)
3bec0dbc6e mdbx: ending the transaction and return an error in case reader-slot was evicted. 2025-11-05 14:51:44 +03:00
Леонид Юрьев (Leonid Yuriev)
c07cfd30e1 mdbx: refine description of MDBX_BAD_RSLOT. 2025-11-05 14:51:44 +03:00
Леонид Юрьев (Leonid Yuriev)
ffb822cb61 mdbx: refine internal env_info_sys(). 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
7f8e3c8781 mdbx: more refining chk-output. 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
916e6e817d mdbx: refine handling sys_allocation_granularity. 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
bf3f9be98a mdbx: using clang-format-22. 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
7b112df36e mdbx: fix unused functions warning from modern clang. 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
4073330ad7 mdbx: patch update for older versions of buildroot. 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
f525e4d292 mdbx: merge-in ChangeLog from the stable/0.13.x branch. 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
baf3eb267f mdbx: fix extra type-casting typo. 2025-11-05 01:20:10 +03:00
Леонид Юрьев (Leonid Yuriev)
0b24446e8e mdbx: fix typo assertion-failure regression. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
8bd2ae9f20 mdbx: update ChangeLog. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
f488d84dc7 mdbx-cmake: cleanup from linking with libm. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
f695a1b48e mdbx-make: cleanup from linking with libm. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
5fb45cb3c9 mdbx-make: add -Wl,--as-needed' to LDFLAGS. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
0838af8f3d mdbx-tools: cleanup mdbx_stat from float-point. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
255a431bc1 mdbx-tools: cleanup mdbx_chk from float-point. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
dccc807aff mdbx: add float-point-free mdbx_ratio2percents() and mdbx_ratio2digits() to API for tools. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
8b2aa9fb65 mdbx: add getenv_bool() and fetching debug-options from the process environment. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
0d9b59dda1 mdbx: add osal_strcasecmp() and osal_strncasecmp(). 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
65184ff73b mdbx: rework/refine chk reporting tree/pages/tables information (squashed). 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
b8f4d6ccdd mdbx: rename "other" pages to "broken" in chk-output. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
c466dea250 mdbx: clean library core from using a float-point. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
1cf65cd880 mdbx: refine handling the MDBX_WITHOUT_MSVC_CRT option. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
7e43e14c7b mdbx: more info-output from mdbx_env_chk().
Print system io-block size, unified-page-cache block size, space allocated for the dxb-file in a filesystem.
2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
410bbbd9a5 mdbx: extending MDBX_envinfo. 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
06b6739e68 mdbx: fix assertion inside gc_alloc_ex(). 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
045968b46a mdbx: fix txn_basis_snapshot(). 2025-11-05 01:20:09 +03:00
Леонид Юрьев (Leonid Yuriev)
e292e8178c mdbx: update ChangeLog. 2025-10-29 00:14:26 +03:00
Леонид Юрьев (Leonid Yuriev)
60d5ba9790 mdbx: minor workaround for HarmonyOS's bug.
The libc from HarmonyOS SDK erroneously defines `EOWNERDEAD`,
`_POSIX_THREAD_PROCESS_SHARED >= 200809` and `PTHREAD_MUTEX_ROBUST` but
the same time doesn't provide `pthread_mutexattr_setrobust()` and
`pthread_mutex_consistent()`.

This commit add a minor workaround for such bug for successful building
without explicitly defining `MDBX_LOCKING=2001`.

Related to https://github.com/erthink/libmdbx/issues/285, https://github.com/vorot93/libmdbx-rs/issues/41, https://github.com/isar-community/isar-community/issues/28.
2025-10-29 00:10:06 +03:00
Leonid Yuriev
43c4503a77 mdbx: minor clean-up tautology in assertion. 2025-10-26 18:02:04 +03:00
Leonid Yuriev
aafe0f0fba mdbx: use Windows10 SDK by default. 2025-10-26 18:02:04 +03:00
Леонид Юрьев (Leonid Yuriev)
dc5f119de1 mdbx-cmake: support of MDBX_USE_FALLOCATE for CMake and Conan. 2025-10-20 12:52:36 +03:00
Леонид Юрьев (Leonid Yuriev)
a14fe7f195 mdbx: append TODO. 2025-10-20 12:17:46 +03:00
Леонид Юрьев (Leonid Yuriev)
4d6eb8a959 mdbx: update ChangeLog. 2025-10-18 12:36:36 +03:00
Леонид Юрьев (Leonid Yuriev)
2b0bfb9eea mdbx: revert/drop MDBX_DBG_NOFALLOC_INCORE and introduce osal_fsetsize().
This fixes regression after the 2a7f460345 as when
a DXB file remains longer than necessary on Mac or Linux when building without `_GNU_SOURCE`.
2025-10-18 12:36:29 +03:00
Леонид Юрьев (Leonid Yuriev)
5f2f5f34e0 mdbx-tests: fix minor typo in the battery-tmux script. 2025-10-17 14:31:17 +03:00
Леонид Юрьев (Leonid Yuriev)
a52fba9dbc mdbx: update ChangeLog. 2025-10-10 09:47:23 +03:00
Леонид Юрьев (Leonid Yuriev)
ee6a045f17 mdbx: add MDBX_DBG_NOFALLOC_INCORE.
It is a workaround to sporadic test failures due to lack of space in tmpfs and/or free memory.
2025-10-09 23:26:18 +03:00
Леонид Юрьев (Leonid Yuriev)
ed2cb62f39 mdbx-doc: refine doxygen comments for enum MDBX_debug_flags_t. 2025-10-09 23:26:18 +03:00
Леонид Юрьев (Leonid Yuriev)
924581bdc8 mdbx: merge-in ChangeLog from the stable/0.13.x branch. 2025-09-18 09:21:46 +03:00
Stefan de Konink
19db693d00 Doc: add difference between mdbx_dbi_open and mdbx_dbi_open2 2025-09-16 13:42:59 +03:00
Леонид Юрьев (Leonid Yuriev)
48c3805a96 mdbx: update TODO. 2025-09-12 12:49:54 +03:00
Леонид Юрьев (Leonid Yuriev)
07b07e19b3 mdbx: update ChangeLog. 2025-09-11 20:16:08 +03:00
Леонид Юрьев (Leonid Yuriev)
bdbbf3db68 mdbx: fix rare/specific unexpected assertion failure bmi > 0 on 32-bit debug builds. 2025-09-11 19:06:14 +03:00
Леонид Юрьев (Leonid Yuriev)
f2a5ca26a6 mdbx: using atomic_yield() inside safe64_read() retry loop. 2025-09-09 22:26:40 +03:00
50 changed files with 1305 additions and 846 deletions

View File

@@ -335,19 +335,6 @@ if(NOT APPLE
endif()
endif()
check_function_exists(pow NOT_NEED_LIBM)
if(NOT_NEED_LIBM)
set(LIB_MATH "")
else()
set(CMAKE_REQUIRED_LIBRARIES m)
check_function_exists(pow HAVE_LIBM)
if(HAVE_LIBM)
set(LIB_MATH m)
else()
message(FATAL_ERROR "No libm found for math support")
endif()
endif()
if(SUBPROJECT)
if(NOT DEFINED BUILD_SHARED_LIBS)
option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)" OFF)
@@ -705,6 +692,8 @@ mark_as_advanced(MDBX_ENABLE_PROFGC)
add_option(MDBX ENABLE_DBI_SPARSE
"Support for sparse sets of DBI handles to reduce overhead when starting and processing transactions" ON)
add_option(MDBX ENABLE_DBI_LOCKFREE "Support for deferred releasing and a lockfree path to quickly open DBI handles" ON)
add_option(MDBX USE_FALLOCATE "Using posix_fallocate() or fcntl(F_PREALLOCATE) on OSX" AUTO)
mark_as_advanced(MDBX_USE_FALLOCATE)
if(NOT MDBX_AMALGAMATED_SOURCE)
if(CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG")
@@ -1034,10 +1023,6 @@ if(MDBX_BUILD_TOOLS)
target_setup_options(mdbx_${TOOL})
target_link_libraries(mdbx_${TOOL} ${TOOL_MDBX_LIB})
endforeach()
if(LIB_MATH)
target_link_libraries(mdbx_chk ${LIB_MATH})
target_link_libraries(mdbx_stat ${LIB_MATH})
endif()
endif()
# ######################################################################################################################

View File

@@ -9,7 +9,7 @@ Please use the `stable` branch or the latest release for production environment
Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
Всё будет хорошо!
## v0.14.2 в активной разработке без конкретизации даты выпуска
## v0.14.2 в разработке без конкретизации даты выпуска
Продолжение развития нового куста/линейки версий с добавлением функционала, расширением API и внутренними переработками.
@@ -17,6 +17,7 @@ Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
- [Erigon](https://erigon.tech/) за спонсорство.
- [Артёму Воротникову](https://github.com/vorot93) за сообщение об ошибках и тестирование [привязок для Rust](https://github.com/vorot93/libmdbx-rs).
- [Stefan de Konink](https://github.com/skinkie) for fixing [Python bindings](https://github.com/wtdcode/mdbx-py) and documentation improvement.
Новое:
@@ -35,6 +36,15 @@ Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
- В API копирования БД добавлена опция `MDBX_CP_OVERWRITE` (перезапись целевого файла),
а в утилиту `mdbx_copy` аналогичная по смыслу опция командной строки `-f` .
- Поддержка Harmony OS (OHOS).
- Операции с плавающей точкой больше не используются как внутри библиотеки, так и в утилитах, а из сценариев сборки удалено связывание c `libm`.
- Обеспечена возможность установки отладочных опций `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` и других, через переменные среды окружения.
Но соответствующие отладочные возможности по-прежнему должны быть активированы во время сборки.
- Расширен и переработан состав информации, формируемой функцией `mdbx_env_chk()` и выводимой утилитой `mdbx_chk`.
Исправления:
- Устранена критическая ошибка в функционале `mdbx_env_resurrect_after_fork()` при использовании SysV-семафоров.
@@ -63,20 +73,41 @@ Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
- Устранено получение неожиданного `SIGBUS` из-за отложенного/ленивого выделения места в заполненной файловой системе после приращения файла БД.
Более подробное пояснение в комментарии коммита [`2a7f460345edbeb26a51782cbe6af3c55254ae77`](https://gitflic.ru/project/erthink/libmdbx/commit/2a7f460345edbeb26a51782cbe6af3c55254ae77).
- Исправлена assert-проверка в пути сканирования битовой карты DBI-дескрипторов приводившая к редким падениям 32-битных отладочных сборок.
- Переделан поиск утилит `lib.exe` и `dlltool.exe` при сборке посредством CMake на Windows.
- Устранено падение при выполнении Thread-Local-Storage деструкторов при выгрузке библиотеки и наличии экземпляров env, инициализация которых не была завершена.
- В C++ API добавлена упущенная специализация шаблона `std::hash<mdbx::buffer<...>>`.
Изменение поведения:
- Вновь включена/разрешена на старых ядрах Linux, начиная с версии 3.16, так как
сейчас уже нет причин отказываться от работы на 3.16 поддерживая при этом ядра 4.x,
и еще есть проекты (Isar, Isar-Community, Hive) которым требуется такая поддержка.
- Изменено значение по-умолчанию порога слияния страниц с 25% до 33%.
- Ошибка `MDBX_WANNA_RECOVERY` при открытии БД в режиме только-чтение теперь возвращается если размер БД не кратен размеру системной страницы,
но игнорируется некратность размеру блока выделения виртуальной памяти. Этим устраняется регресс, проявившийся вследствие изменения поведения
после задействования системного вызова `fallocate()` для предотвращения `SIGBUS` после приращения файла БД в заполненной файловой системе.
- Для уменьшения вероятности неожиданных ошибок, вследствие переходных процессов и отложенной обработки в ядре ОС при конкурентном закрытии и
открытии БД разными процессами, втрое увеличено количество повторных попыток захвата блокировок. Предположительно это также решит проблему
неожиданных ошибок `EAGAIN` (11) на Android при рестарте приложений и открытии БД сразу после закрытия.
- По-умолчанию сборка для Windows теперь выполняется с использованием SDK уровня Windows 10, а не Windows 7.
- Изменён размер и состав структуры `MDBX_envinfo`, а функция `mdbx_env_info_ex()` больше не поддерживает старые варианты.
Этим нарушена совместимость ABI со старыми версиями библиотеки, но сохранена совместимость API на уровне исходного кода.
Прочие доработки:
- Доработана логика отказа от использования OFD-блокировок на POSIX-платформах.
Теперь кроме `EINVAL` учитываются дополнительные коды ошибок (`ENOSYS`, `ENOIMPL`, `ENOTSUP`, `ENOSUPP`, `EOPNOTSUPP`),
что позволит работать собранной библиотеке в некоторых случаях, когда актуальное ядро/контейнер/эмулятор не поддерживает требуемых системных вызовов.
- Изменено значение по-умолчанию порога слияния страниц с 25% до 33%.
- Тесты дополнены сценариями для проверки добавленных возможностей, выявленных регрессов и ошибок.
- В тестовый фреймворк добавлена поддержка опции --numa # для привязки стохастического теста к NUMA-узлу,
@@ -87,6 +118,16 @@ Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
- В функционал проверки целостности БД и утилиту mdbx_chk добавлен вывод гистограммы заполнения страниц образующих структуру дерева и участвующих в операциях разделения/слияния/перебалансировки.
- Для Android добавлен обход (workaround) для уменьшения вероятности системной ошибки `EAGAIN` возникающей
из-за нехватки системных ресурсов и переходных процессов при закрытии и быстром повторном открытии БД.
- Для Linux добавлено предотвращение проявления ошибки в реализации fast_commit файловой системы Ext4.
- В отладочные сборки на Windows при срабатывании assert-проверок добавлена поддержка вариантов "Пропустить" и "Повторить".
- В используемых на платформе Windows файловых блокировках задействованы ожидания с таймаутами,
что теоретически должно снизить вероятность возникновения ошибок `ERROR_LOCK_VIOLATION` (`33`) при открытии БД в конкурентных сценариях.
--------------------------------------------------------------------------------
@@ -372,6 +413,69 @@ Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
********************************************************************************
## v0.13.9 "ИС-2" (IS-2) от 2025-10-31
Поддерживающий выпуск стабильной ветки с исправлением обнаруженных ошибок и устранением недочётов.
Выпуск назван в память о cамом мощном тяжелом советском танке ["ИС-2"](https://ru.ruwiki.ru/wiki/ИС-2), который был принят на вооружение
31 октября 1943 года в разгар Великой Отечественной Войны и долгое время оставался одной из сильнейших машин мира в категории по массе 40—50 тонн.
Благодарности:
- [Erigon](https://erigon.tech/) за спонсорство.
Исправления:
- Исправлена assert-проверка в пути сканирования битовой карты DBI-дескрипторов приводившая к редким падениям 32-битных отладочных сборок.
- Переделан поиск утилит `lib.exe` и `dlltool.exe` при сборке посредством CMake на Windows.
- Устранён регресс проявлявшийся увеличением (не-уменьшением) размера БД, после добавления использования `fallocate()`
ради предотвращения SIGBUS при нехватке места в файловой системе где расположена БД.
- Устранена опечатка в тестовом скрипте `test/battery-tmux.sh` приводящая к созданию мусорного файла с именем `-`.
- Удалено лишнее/ненужное использование макроса `MDBX_INTERNAL` оставшееся после рефакторинга.
- Для Android добавлен обход (workaround) для уменьшения вероятности системной ошибки `EAGAIN` возникающей
из-за нехватки системных ресурсов и переходных процессов при закрытии и быстром повторном открытии БД.
Прочие доработки:
- Поддержка Harmony OS (OHOS).
--------------------------------------------------------------------------------
## v0.13.8 "Всеобуч" (v`seabooch) от 2025-08-31
Поддерживающий выпуск стабильной ветки с исправлением обнаруженных ошибок и устранением недочётов,
в день 100 летнего юбилея Постановления Всероссийского центрального исполнительного комитета о всеобщем бесплатном начальном образовании.
Благодарности:
- [Erigon](https://erigon.tech/) за спонсорство.
Исправления:
- Устранена возможность получения неожиданного `SIGBUS` из-за отложенного/ленивого выделения места в заполненной файловой системе после приращения файла БД.
Более подробное пояснение в комментарии коммита [`2930b304dc674bbccd188b7ce7c3f83755ef706e`](https://gitflic.ru/project/erthink/libmdbx/commit/2930b304dc674bbccd188b7ce7c3f83755ef706e).
Изменение поведения:
- Вновь включена/разрешена на старых ядрах Linux, начиная с версии 3.16, так как
сейчас уже нет причин отказываться от работы на 3.16 поддерживая при этом ядра 4.x,
и еще есть проекты (Isar, Isar-Community, Hive) которым требуется такая поддержка.
- Ошибка `MDBX_WANNA_RECOVERY` при открытии БД в режиме только-чтение теперь возвращается если размер БД не кратен размеру системной страницы,
но игнорируется некратность размеру блока выделения виртуальной памяти.
Этим устраняется регресс, проявившийся вследствие изменения поведения после задействования
системного вызова `fallocate()` для предотвращения `SIGBUS` после приращения файла БД в заполненной файловой системе.
--------------------------------------------------------------------------------
## v0.13.7 "Дружба" (Friendship) от 2025-07-30.
Поддерживающий выпуск стабильной ветки с исправлением обнаруженных ошибок и устранением недочётов,

View File

@@ -106,11 +106,12 @@ endef
define uname2ldflags
case "$(UNAME)" in
CYGWIN*|MINGW*|MSYS*|Windows*)
echo '-Wl,--gc-sections,-O1';
echo '-Wl,--gc-sections,-O1,--as-needed';
;;
*)
$(LD) --help 2>/dev/null | grep -q -- --gc-sections && echo '-Wl,--gc-sections,-z,relro,-O1';
$(LD) --help 2>/dev/null | grep -q -- -dead_strip && echo '-Wl,-dead_strip';
$(LD) --help 2>/dev/null | grep -q -- --as-needed && echo '-Wl,--as-needed';
;;
esac
endef
@@ -119,16 +120,16 @@ endef
define uname2libs
case "$(UNAME)" in
CYGWIN*|MINGW*|MSYS*|Windows*)
echo '-lm -lntdll -lwinmm';
echo '-lntdll -lwinmm';
;;
*SunOS*|*Solaris*)
echo '-lm -lkstat -lrt';
echo '-lkstat -lrt';
;;
*Darwin*|OpenBSD*)
echo '-lm';
echo '';
;;
*)
echo '-lm -lrt';
echo '-lrt';
;;
esac
endef
@@ -561,7 +562,7 @@ $(foreach file,$(TOOLS),$(eval $(call tool-rule,$(file))))
mdbx_test: $(TEST_OBJ) libmdbx.$(SO_SUFFIX)
@echo ' LD $@'
$(QUIET)$(CXX) $(CXXFLAGS) $(TEST_OBJ) -Wl,-rpath . -L . -l mdbx $(EXE_LDFLAGS) $(LIBS) -o $@
$(QUIET)$(CXX) $(CXXFLAGS) $(TEST_OBJ) -Wl,-rpath . -L . -l mdbx $(EXE_LDFLAGS) $(LIBS) -lm -o $@
$(MDBX_GIT_DIR)/HEAD $(MDBX_GIT_DIR)/index $(MDBX_GIT_DIR)/refs/tags:
@echo '*** ' >&2

View File

@@ -1,6 +1,9 @@
TODO
----
- add optional page-get and operation statistics for cursors.
- split ASSERT() to CHECK{0,1,2,3} and basal `assert()`.
- [SWIG](https://www.swig.org/).
- Параллельная lto-сборка с устранением предупреждений.
- Интеграция c DTrace и аналогами.
@@ -16,9 +19,15 @@ TODO
- [Support MessagePack for Keys & Values](https://libmdbx.dqdkfa.ru/dead-github/issues/115).
- Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc.
In development
--------------
- get-cached API.
- digging/refactoring/optimizing page splitting and tree rebalance.
Done
----
- HarmonyOS support.
- Ранняя/не-отложенная очистка GC.
- Рефакторинг gc-get/gc-put c переходом на "интервальные" списки.
- [Engage new terminology](https://libmdbx.dqdkfa.ru/dead-github/issues/137).

View File

@@ -82,6 +82,7 @@ class libmdbx(ConanFile):
'mdbx.use_mincore': ['Auto', True, False],
'mdbx.use_ofdlocks': ['Auto', True, False],
'mdbx.use_sendfile': ['Auto', True, False],
'mdbx.use_fallocate': ['Auto', True, False],
'mdbx.without_msvc_crt': ['Default', True, False],
'shared': [True, False],
}
@@ -113,6 +114,7 @@ class libmdbx(ConanFile):
'mdbx.use_mincore': 'Auto',
'mdbx.use_ofdlocks': 'Auto',
'mdbx.use_sendfile': 'Auto',
'mdbx.use_fallocate': 'Auto',
'mdbx.without_msvc_crt': 'Default',
'shared': True,
}
@@ -143,7 +145,8 @@ class libmdbx(ConanFile):
'mdbx.use_copyfilerange': 'Advanced: Use `copy_file_range()` syscall. ',
'mdbx.use_mincore': "Use Unix' `mincore()` to determine whether database pages are resident in memory. ",
'mdbx.use_ofdlocks': 'Advanced: Use POSIX OFD-locks. ',
'mdbx.use_sendfile': 'Advancedc: Use `sendfile()` syscall. ',
'mdbx.use_sendfile': 'Advanced: Use `sendfile()` syscall. ',
'mdbx.use_fallocate': 'Advanced: Use posix_fallocate() or fcntl(F_PREALLOCATE) on OSX. ',
'mdbx.without_msvc_crt': 'Avoid dependence from MSVC CRT and use ntdll.dll instead. ',
}
@@ -160,6 +163,7 @@ class libmdbx(ConanFile):
self.options.rm_safe('mdbx.mmap_incoherent_file_write')
self.options.rm_safe('mdbx.use_mincore')
self.options.rm_safe('mdbx.use_ofdlocks')
self.options.rm_safe('mdbx.use_fallocate')
else:
self.options.rm_safe('mdbx.without_msvc_crt')
if is_apple_os(self):

63
mdbx.h
View File

@@ -837,7 +837,9 @@ enum MDBX_constants {
/** Log level
* \note Levels more detailed than (greater than) \ref MDBX_LOG_NOTICE
* requires build libmdbx with \ref MDBX_DEBUG option. */
* requires build libmdbx with \ref MDBX_DEBUG option.
*
* \see mdbx_setup_debug() \see MDBX_log_level_t */
typedef enum MDBX_log_level {
/** Critical conditions, i.e. assertion failures.
* \note libmdbx always produces such messages regardless
@@ -894,24 +896,26 @@ typedef enum MDBX_log_level {
*
* \details `MDBX_DBG_DUMP` and `MDBX_DBG_LEGACY_MULTIOPEN` always have an
* effect, but `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` and `MDBX_DBG_JITTER` only if
* libmdbx built with \ref MDBX_DEBUG. */
* libmdbx built with \ref MDBX_DEBUG.
*
* \see mdbx_setup_debug() \see MDBX_debug_flags_t */
typedef enum MDBX_debug_flags {
MDBX_DBG_NONE = 0,
/** Enable assertion checks.
/** Enables assertion checks.
* \note Always enabled for builds with `MDBX_FORCE_ASSERTIONS` option,
* otherwise requires build with \ref MDBX_DEBUG > 0 */
MDBX_DBG_ASSERT = 1,
/** Enable pages usage audit at commit transactions.
/** Enables pages usage audit at commit transactions.
* \note Requires build with \ref MDBX_DEBUG > 0 */
MDBX_DBG_AUDIT = 2,
/** Enable small random delays in critical points.
/** Enables small random delays in critical points.
* \note Requires build with \ref MDBX_DEBUG > 0 */
MDBX_DBG_JITTER = 4,
/** Include or not meta-pages in coredump files.
/** Controls including of a database(s) meta-pages in coredump files.
* \note May affect performance in \ref MDBX_WRITEMAP mode */
MDBX_DBG_DUMP = 8,
@@ -921,9 +925,8 @@ typedef enum MDBX_debug_flags {
/** Allow read and write transactions overlapping for the same thread. */
MDBX_DBG_LEGACY_OVERLAP = 32,
/** Don't auto-upgrade format signature.
* \note However a new write transactions will use and store
* the last signature regardless this flag */
/** Disables automatic updating of the database format signature, i.e. upgrade database format on a media.
* \note Nonetheless a new write transactions will use and store the last signature regardless this flag */
MDBX_DBG_DONT_UPGRADE = 64,
#ifdef ENABLE_UBSAN
@@ -958,7 +961,9 @@ typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function, in
/** \brief Setup global log-level, debug options and debug logger.
* \returns The previous `debug_flags` in the 0-15 bits
* and `log_level` in the 16-31 bits. */
* and `log_level` in the 16-31 bits.
*
* \see MDBX_log_level_t \see MDBX_debug_flags_t */
LIBMDBX_API int mdbx_setup_debug(MDBX_log_level_t log_level, MDBX_debug_flags_t debug_flags, MDBX_debug_func *logger);
typedef void MDBX_debug_func_nofmt(MDBX_log_level_t loglevel, const char *function, int line, const char *msg,
@@ -1007,7 +1012,10 @@ MDBX_NORETURN LIBMDBX_API void mdbx_panic(const char *fmt, ...) MDBX_PRINTF_ARGS
/** \brief Panics with assertion failed message and causes abnormal process
* termination. */
MDBX_NORETURN LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, unsigned line);
#if !((defined(_WIN32) || defined(_WIN64)) && !MDBX_WITHOUT_MSVC_CRT)
MDBX_NORETURN
#endif /* MDBX_WITHOUT_MSVC_CRT */
LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, unsigned line);
/** end of c_debug @} */
/** \brief Environment flags
@@ -1904,8 +1912,7 @@ typedef enum MDBX_error {
* - The table was dropped and recreated with different flags. */
MDBX_INCOMPATIBLE = -30784,
/** Invalid reuse of reader locktable slot,
* e.g. read-transaction already run for current thread */
/** Reader locktable slot was unexpectedly reused or cleared by an enemy thread */
MDBX_BAD_RSLOT = -30783,
/** Transaction is not valid for requested operation,
@@ -2810,17 +2817,20 @@ struct MDBX_envinfo {
uint64_t shrink; /**< Shrink threshold for datafile */
uint64_t grow; /**< Growth step for datafile */
} mi_geo;
uint64_t mi_mapsize; /**< Size of the data memory map */
uint64_t mi_mapsize; /**< Size of the database memory map */
uint64_t mi_dxb_fsize; /**< Current database file size */
uint64_t mi_dxb_fallocated; /**< Space allocated for the database file in a filesystem */
uint64_t mi_last_pgno; /**< Number of the last used page */
uint64_t mi_recent_txnid; /**< ID of the last committed transaction */
uint64_t mi_latter_reader_txnid; /**< ID of the last reader transaction */
uint64_t mi_self_latter_reader_txnid; /**< ID of the last reader transaction
of caller process */
uint64_t mi_self_latter_reader_txnid; /**< ID of the last reader transaction of this/current process */
uint64_t mi_meta_txnid[3], mi_meta_sign[3];
uint32_t mi_maxreaders; /**< Total reader slots in the environment */
uint32_t mi_numreaders; /**< Max reader slots used in the environment */
uint32_t mi_dxb_pagesize; /**< Database pagesize */
uint32_t mi_sys_pagesize; /**< System pagesize */
uint32_t mi_sys_upcblk; /**< System "Unified Page Cache" block size */
uint32_t mi_sys_ioblk; /**< Filesystem I/O block size */
/** \brief A mostly unique ID that is regenerated on each boot.
@@ -4562,6 +4572,10 @@ typedef int(MDBX_cmp_func)(const MDBX_val *a, const MDBX_val *b) MDBX_CXX17_NOEX
* \param [out] dbi Address where the new \ref MDBX_dbi handle
* will be stored.
*
* The name in \ref mdbx_dbi_open() is a null-terminated string, while
* \ref mdbx_dbi_open2() supports arbitrary length keys which are not
* truncated, for example to support a fixed-width integer type.
*
* For \ref mdbx_dbi_open_ex() additional arguments allow you to set custom
* comparison functions for keys and values (for multimaps).
* \see avoid_custom_comparators
@@ -4594,6 +4608,8 @@ LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flag
* \param [in] name The name of the table to open. If only a single
* table is needed in the environment,
* this value may be NULL.
* The name in \ref mdbx_dbi_open_ex() is null terminated,
* while \ref mdbx_dbi_open_ex2() supports an arbitrary length.
* \param [in] flags Special options for this table.
* \param [in] keycmp Optional custom key comparison function for a table.
* \param [in] datacmp Optional custom data comparison function for a table.
@@ -6534,17 +6550,17 @@ typedef struct MDBX_chk_table {
size_t payload_bytes, lost_bytes;
struct {
size_t all, empty, other;
size_t all, empty, broken;
size_t branch, leaf;
size_t nested_branch, nested_leaf, nested_subleaf;
} pages;
struct {
/// Tree deep histogram
struct MDBX_chk_histogram deep;
struct MDBX_chk_histogram height;
/// Histogram of large/overflow pages length
struct MDBX_chk_histogram large_pages;
/// Histogram of nested trees height, span length for GC
struct MDBX_chk_histogram nested_tree;
struct MDBX_chk_histogram nested_height;
/// Keys length histogram
struct MDBX_chk_histogram key_len;
/// Values length histogram
@@ -6552,9 +6568,9 @@ typedef struct MDBX_chk_table {
/// Number of multi-values (aka duplicates) histogram
struct MDBX_chk_histogram multival;
/// Histogram of branch and leaf pages filling in percents
struct MDBX_chk_histogram tree_filling;
struct MDBX_chk_histogram tree_density;
/// Histogram of nested tree(s) branch and leaf pages filling in percents
struct MDBX_chk_histogram nested_tree_filling;
struct MDBX_chk_histogram large_or_nested_density;
} histogram;
} MDBX_chk_table_t;
@@ -6661,6 +6677,11 @@ LIBMDBX_API int mdbx_env_chk(MDBX_env *env, const MDBX_chk_callbacks_t *cb, MDBX
* \returns Нулевое значение в случае успеха, иначе код ошибки. */
LIBMDBX_API int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx);
LIBMDBX_API const char *mdbx_ratio2digits(uint64_t numerator, uint64_t denominator, int precision, char *buffer,
size_t buffer_size);
LIBMDBX_API const char *mdbx_ratio2percents(uint64_t value, uint64_t whole, char *buffer, size_t buffer_size);
/** end of chk @} */
/** end of c_api @} */

View File

@@ -6415,6 +6415,12 @@ template <> struct hash<::mdbx::slice> {
MDBX_CXX14_CONSTEXPR size_t operator()(::mdbx::slice const &slice) const noexcept { return slice.hash_value(); }
};
template <class ALLOCATOR, typename CAPACITY_POLICY> struct hash<::mdbx::buffer<ALLOCATOR, CAPACITY_POLICY>> {
MDBX_CXX14_CONSTEXPR size_t operator()(::mdbx::buffer<ALLOCATOR, CAPACITY_POLICY> const &buffer) const noexcept {
return buffer.hash_value();
}
};
/// end cxx_api @}
} // namespace std

View File

@@ -1,13 +1,13 @@
From f2f1f6e76c1538d044b552d9e7ecedc3433e6cd9 Mon Sep 17 00:00:00 2001
From b2f1297dd2cd42cc0e04f1900fbf6da6c2694b7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?=
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= <leo@yuriev.ru>
Date: Sun, 3 Aug 2025 23:59:11 +0300
Date: Fri, 31 Oct 2025 16:58:31 +0300
Subject: [PATCH] package/libmdbx: new package (library/database).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch adds libmdbx:
This patch adds libmdbx 0.13.9:
- libmdbx is one of the fastest compact embeddable key-value ACID database.
- libmdbx has a specific set of properties and capabilities,
focused on creating unique lightweight solutions.
@@ -15,158 +15,55 @@ This patch adds libmdbx:
in terms of reliability, features and performance.
- more information at https://libmdbx.dqdkfa.ru
The 0.13.7 "Дружба" (Friendship) is stable release of _libmdbx_ branch with new superior features.
The complete ChangeLog: https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md
The 0.13.9 "ИС-2" (IS-2) is bugfix release of the stable branch.
For more information please see [ChangeLog](https://github.com/erthink/libmdbx/blob/stable/ChangeLog.md).
Signed-off-by: Леонид Юрьев (Leonid Yuriev) <leo@yuriev.ru>
---
DEVELOPERS | 3 +++
package/Config.in | 1 +
package/libmdbx/Config.in | 45 ++++++++++++++++++++++++++++++++++++
package/libmdbx/libmdbx.hash | 6 +++++
package/libmdbx/libmdbx.mk | 41 ++++++++++++++++++++++++++++++++
5 files changed, 96 insertions(+)
create mode 100644 package/libmdbx/Config.in
create mode 100644 package/libmdbx/libmdbx.hash
create mode 100644 package/libmdbx/libmdbx.mk
package/libmdbx/Config.in | 4 +++-
package/libmdbx/libmdbx.hash | 2 +-
package/libmdbx/libmdbx.mk | 2 +-
3 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/DEVELOPERS b/DEVELOPERS
index 9ab1e125f4..758ff6a2d5 100644
--- a/DEVELOPERS
+++ b/DEVELOPERS
@@ -1482,6 +1482,9 @@ N: Leon Anavi <leon.anavi@konsulko.com>
F: board/olimex/a10_olinuxino
F: configs/olimex_a10_olinuxino_lime_defconfig
+N: Leonid Yuriev <leo@yuriev.ru>
+F: package/libmdbx/
+
N: Lionel Flandrin <lionel@svkt.org>
F: package/python-babel/
F: package/python-daemonize/
diff --git a/package/Config.in b/package/Config.in
index 016a99ed1a..a6f95bfaa9 100644
--- a/package/Config.in
+++ b/package/Config.in
@@ -1372,6 +1372,7 @@ menu "Database"
source "package/kompexsqlite/Config.in"
source "package/leveldb/Config.in"
source "package/libgit2/Config.in"
+ source "package/libmdbx/Config.in"
source "package/libodb/Config.in"
source "package/libodb-boost/Config.in"
source "package/libodb-mysql/Config.in"
diff --git a/package/libmdbx/Config.in b/package/libmdbx/Config.in
new file mode 100644
index 0000000000..a9a4ac45c5
--- /dev/null
index a9a4ac45c5..1640dbd9de 100644
--- a/package/libmdbx/Config.in
+++ b/package/libmdbx/Config.in
@@ -0,0 +1,45 @@
+config BR2_PACKAGE_LIBMDBX
+ bool "libmdbx"
+ depends on BR2_USE_MMU
+ depends on BR2_TOOLCHAIN_HAS_SYNC_4
+ depends on BR2_TOOLCHAIN_HAS_THREADS
+ depends on BR2_TOOLCHAIN_GCC_AT_LEAST_4_4
+ help
+ One of the fastest compact key-value ACID database
+ without WAL. libmdbx has a specific set of properties
+ and capabilities, focused on creating unique lightweight
+ solutions.
+
+ libmdbx surpasses the legendary LMDB in terms of
+ reliability, features and performance.
+
+ https://libmdbx.dqdkfa.ru
+
+if BR2_PACKAGE_LIBMDBX
+
+config BR2_PACKAGE_LIBMDBX_TOOLS
+ bool "install tools"
+ help
+ Install libmdbx tools for checking, dump, restore
+ and show statistics of databases.
+
+config BR2_PACKAGE_LIBMDBX_CXX
+ bool "C++ API"
+ depends on BR2_INSTALL_LIBSTDCPP
+ depends on BR2_TOOLCHAIN_GCC_AT_LEAST_4_8
+ depends on !BR2_TOOLCHAIN_HAS_GCC_BUG_64735
+ help
+ Enable modern C++11/14/17/20 API for libmdbx.
+
+comment "libmdbx C++ support needs a toolchain w/ C++11, gcc >= 4.8 w/o bug#64735"
+ depends on !BR2_INSTALL_LIBSTDCPP || \
+ !BR2_TOOLCHAIN_GCC_AT_LEAST_4_8 || \
+ BR2_TOOLCHAIN_HAS_GCC_BUG_64735
+
+endif
+
+comment "libmdbx needs MMU, a toolchain w/ threads, gcc >= 4.4 w/ 4-byte atomics"
+ depends on BR2_USE_MMU
+ depends on !BR2_TOOLCHAIN_HAS_THREADS || \
+ !BR2_TOOLCHAIN_HAS_SYNC_4 || \
+ !BR2_TOOLCHAIN_GCC_AT_LEAST_4_4
@@ -11,7 +11,9 @@ config BR2_PACKAGE_LIBMDBX
solutions.
libmdbx surpasses the legendary LMDB in terms of
- reliability, features and performance.
+ reliability, features and performance. At the end of 2024
+ libmdbx was chosen by all modern Ethereum frontiers/nodes
+ as a storage engine.
https://libmdbx.dqdkfa.ru
diff --git a/package/libmdbx/libmdbx.hash b/package/libmdbx/libmdbx.hash
new file mode 100644
index 0000000000..8c7efb184b
--- /dev/null
index ae5266716b..4a4f302015 100644
--- a/package/libmdbx/libmdbx.hash
+++ b/package/libmdbx/libmdbx.hash
@@ -0,0 +1,6 @@
+# Hashes from: https://libmdbx.dqdkfa.ru/release/SHA256SUMS
+sha256 d00c1287ec6bbc366363ccdd3eea97bd470ccb5cc102d56b341f84a9fba7e8e9 libmdbx-amalgamated-0.13.7.tar.xz
+
+# Locally calculated
+sha256 0d542e0c8804e39aa7f37eb00da5a762149dc682d7829451287e11b938e94594 LICENSE
+sha256 651f71b46c6bb0046d2122df7f9def9cb24f4dc28c5b11cef059f66565cda30f NOTICE
@@ -1,5 +1,5 @@
# Hashes from: https://libmdbx.dqdkfa.ru/release/SHA256SUMS
-sha256 57db987de6f7ccc66a66ae28a7bda9f9fbb48ac5fb9279bcca92fd5de13075d1 libmdbx-amalgamated-0.13.6.tar.xz
+sha256 63d2608c8f7c23185c0d27d817d42dd720e84973224ffc584c7f7b522f5f06fe libmdbx-amalgamated-0.13.9.tar.xz
# Locally calculated
sha256 0d542e0c8804e39aa7f37eb00da5a762149dc682d7829451287e11b938e94594 LICENSE
diff --git a/package/libmdbx/libmdbx.mk b/package/libmdbx/libmdbx.mk
new file mode 100644
index 0000000000..bbb37f21a6
--- /dev/null
index f461d98397..62817a98f8 100644
--- a/package/libmdbx/libmdbx.mk
+++ b/package/libmdbx/libmdbx.mk
@@ -0,0 +1,41 @@
+################################################################################
+#
+# libmdbx
+#
+################################################################################
+
+LIBMDBX_VERSION = 0.13.7
+LIBMDBX_SOURCE = libmdbx-amalgamated-$(LIBMDBX_VERSION).tar.xz
+LIBMDBX_SITE = https://libmdbx.dqdkfa.ru/release
+LIBMDBX_SUPPORTS_IN_SOURCE_BUILD = NO
+LIBMDBX_LICENSE = Apache-2.0
+LIBMDBX_LICENSE_FILES = LICENSE NOTICE
+LIBMDBX_STRIP_COMPONENTS = 0
+LIBMDBX_INSTALL_STAGING = YES
+
+# Set CMAKE_BUILD_TYPE to Release to remove -Werror and avoid a build failure
+# with glibc < 2.12
+LIBMDBX_CONF_OPTS = \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DMDBX_INSTALL_MANPAGES=OFF \
+ -DBUILD_FOR_NATIVE_CPU=OFF \
+ -DMDBX_BUILD_CXX=$(if $(BR2_PACKAGE_LIBMDBX_CXX),ON,OFF) \
+ -DMDBX_BUILD_TOOLS=$(if $(BR2_PACKAGE_LIBMDBX_TOOLS),ON,OFF)
+
+ifeq ($(BR2_STATIC_LIBS)$(BR2_SHARED_STATIC_LIBS),y)
+LIBMDBX_CONF_OPTS += -DMDBX_INSTALL_STATIC=ON
+else
+LIBMDBX_CONF_OPTS += -DMDBX_INSTALL_STATIC=OFF
+endif
+
+ifeq ($(BR2_SHARED_LIBS)$(BR2_SHARED_STATIC_LIBS),y)
+LIBMDBX_CONF_OPTS += \
+ -DMDBX_BUILD_SHARED_LIBRARY=ON \
+ -DMDBX_LINK_TOOLS_NONSTATIC=ON
+else
+LIBMDBX_CONF_OPTS += \
+ -DMDBX_BUILD_SHARED_LIBRARY=OFF \
+ -DMDBX_LINK_TOOLS_NONSTATIC=OFF
+endif
+
+$(eval $(cmake-package))
@@ -4,7 +4,7 @@
#
################################################################################
-LIBMDBX_VERSION = 0.13.6
+LIBMDBX_VERSION = 0.13.9
LIBMDBX_SOURCE = libmdbx-amalgamated-$(LIBMDBX_VERSION).tar.xz
LIBMDBX_SITE = https://libmdbx.dqdkfa.ru/release
LIBMDBX_SUPPORTS_IN_SOURCE_BUILD = NO
--
2.50.1
2.51.2

View File

@@ -479,7 +479,7 @@ __cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *txn, mdbx_fileha
if (meta->geometry.now != meta->geometry.first_unallocated) {
const size_t whole_size = pgno2bytes(env, meta->geometry.now);
if (!dest_is_pipe)
return osal_fallocate(fd, whole_size);
return osal_fsetsize(fd, whole_size);
const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated);
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
@@ -648,7 +648,7 @@ retry_snap_meta:
/* Extend file if required */
if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
if (!dest_is_pipe)
rc = osal_fallocate(fd, whole_size);
rc = osal_fsetsize(fd, whole_size);
else {
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) {

View File

@@ -241,7 +241,7 @@ __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, siz
return LOG_IFERR(MDBX_BAD_TXN);
if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
rc = tbl_fetch((MDBX_txn *)txn, dbi);
rc = tbl_refresh((MDBX_txn *)txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}

View File

@@ -245,7 +245,8 @@ __cold int mdbx_env_create(MDBX_env **penv) {
#if defined(_WIN32) || defined(_WIN64)
imports.srwl_Init(&env->remap_guard);
InitializeCriticalSection(&env->windowsbug_lock);
InitializeCriticalSection(&env->lck_event_cs);
InitializeCriticalSection(&env->dxb_event_cs);
#else
rc = osal_fastmutex_init(&env->remap_guard);
if (unlikely(rc != MDBX_SUCCESS)) {
@@ -638,7 +639,8 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
ENSURE(env, osal_fastmutex_destroy(&env->dbi_lock) == MDBX_SUCCESS);
#if defined(_WIN32) || defined(_WIN64)
/* remap_guard don't have destructor (Slim Reader/Writer Lock) */
DeleteCriticalSection(&env->windowsbug_lock);
DeleteCriticalSection(&env->lck_event_cs);
DeleteCriticalSection(&env->dxb_event_cs);
#else
ENSURE(env, osal_fastmutex_destroy(&env->remap_guard) == MDBX_SUCCESS);
#endif /* Windows */
@@ -664,11 +666,68 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
/*----------------------------------------------------------------------------*/
static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, const size_t bytes,
troika_t *const troika) {
const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid);
const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat);
const size_t size_before_dxbid = offsetof(MDBX_envinfo, mi_dxbid);
/* Fills the system-related fields of MDBX_envinfo.
 *
 * Always stores the current boot-id, the system page size and the
 * platform-dependent mi_sys_upcblk value. When the DXB file handle of `env`
 * is valid, additionally queries the OS for the data file size, its allocated
 * size and the I/O block size; otherwise these three fields are left zeroed.
 *
 * Returns MDBX_SUCCESS, or the OS error code (GetLastError() on Windows,
 * errno elsewhere) when the file information could not be obtained. */
__must_check_result static int env_info_sys(const MDBX_env *env, MDBX_envinfo *out) {
out->mi_bootid.current.x = globals.bootid.x;
out->mi_bootid.current.y = globals.bootid.y;
out->mi_sys_pagesize = globals.sys_pagesize;
/* mi_sys_upcblk: 0 on OpenBSD, the allocation granularity on Windows,
 * the unified cache block size where AT_UCACHEBSIZE is defined,
 * and the system page size otherwise. */
#ifdef __OpenBSD__
out->mi_sys_upcblk = 0;
#elif defined(_WIN32) || defined(_WIN64)
out->mi_sys_upcblk = globals.sys_allocation_granularity;
#elif defined(AT_UCACHEBSIZE)
out->mi_sys_upcblk = globals.sys_unified_cache_block;
#else
out->mi_sys_upcblk = globals.sys_pagesize;
#endif /* AT_UCACHEBSIZE */
/* File-related fields default to zero and are overwritten below
 * only for the values the OS query actually provides. */
out->mi_dxb_fsize = 0;
out->mi_dxb_fallocated = 0;
out->mi_sys_ioblk = 0;
if (env->dxb_mmap.fd != INVALID_HANDLE_VALUE) {
#if defined(_WIN32) || defined(_WIN64)
/* One scratch buffer shared by the alternative query results. */
union {
BY_HANDLE_FILE_INFORMATION bh;
FILE_STANDARD_INFO std;
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
FILE_STORAGE_INFO storage;
#endif
} sys_finfo;
/* Preferred path: GetFileInformationByHandleEx(), if the import is available. */
if (imports.GetFileInformationByHandleEx &&
imports.GetFileInformationByHandleEx(env->dxb_mmap.fd, FileStandardInfo, &sys_finfo.std,
sizeof(sys_finfo.std))) {
out->mi_dxb_fsize = sys_finfo.std.EndOfFile.QuadPart;
out->mi_dxb_fallocated = sys_finfo.std.AllocationSize.QuadPart;
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
/* Storage info is optional: on failure mi_sys_ioblk just stays zero. */
if (imports.GetFileInformationByHandleEx(env->dxb_mmap.fd, FileStorageInfo, &sys_finfo.storage,
sizeof(sys_finfo.storage))) {
/* Report the larger of the effective physical and the logical sector sizes. */
out->mi_sys_ioblk = (sys_finfo.storage.FileSystemEffectivePhysicalBytesPerSectorForAtomicity >
sys_finfo.storage.LogicalBytesPerSector)
? sys_finfo.storage.FileSystemEffectivePhysicalBytesPerSectorForAtomicity
: sys_finfo.storage.LogicalBytesPerSector;
}
#endif
} else if (GetFileInformationByHandle(env->dxb_mmap.fd, &sys_finfo.bh)) {
/* Fallback path provides only the file size (combined from two 32-bit halves);
 * the allocated size and the I/O block size remain zero. */
out->mi_dxb_fsize = sys_finfo.bh.nFileSizeLow | (uint64_t)sys_finfo.bh.nFileSizeHigh << 32;
} else
return GetLastError();
#else
struct stat sys_fstat;
if (fstat(env->dxb_mmap.fd, &sys_fstat))
return errno;
out->mi_dxb_fsize = sys_fstat.st_size;
/* st_blocks is scaled by 512 to express the allocated size in bytes. */
out->mi_dxb_fallocated = UINT64_C(512) * sys_fstat.st_blocks;
out->mi_sys_ioblk = sys_fstat.st_blksize;
#endif /* !Windows */
}
return MDBX_SUCCESS;
}
__must_check_result static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out,
troika_t *const troika) {
int err = env_info_sys(env, out);
if (unlikely(err != MDBX_SUCCESS))
return err;
if (unlikely(env->flags & ENV_FATAL_ERROR))
return MDBX_PANIC;
@@ -678,7 +737,6 @@ static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo
/* environment not yet opened */
#if 1
/* default behavior: returns the available info but zeroed the rest */
memset(out, 0, bytes);
out->mi_geo.lower = env->geo_in_bytes.lower;
out->mi_geo.upper = env->geo_in_bytes.upper;
out->mi_geo.shrink = env->geo_in_bytes.shrink;
@@ -686,11 +744,6 @@ static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo
out->mi_geo.current = env->geo_in_bytes.now;
out->mi_maxreaders = env->max_readers;
out->mi_dxb_pagesize = env->ps;
out->mi_sys_pagesize = globals.sys_pagesize;
if (likely(bytes > size_before_bootid)) {
out->mi_bootid.current.x = globals.bootid.x;
out->mi_bootid.current.y = globals.bootid.y;
}
return MDBX_SUCCESS;
#else
/* some users may prefer this behavior: return appropriate error */
@@ -710,13 +763,10 @@ static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo
out->mi_meta_sign[1] = unaligned_peek_u64(4, meta1->sign);
out->mi_meta_txnid[2] = troika->txnid[2];
out->mi_meta_sign[2] = unaligned_peek_u64(4, meta2->sign);
if (likely(bytes > size_before_bootid)) {
memcpy(&out->mi_bootid.meta[0], &meta0->bootid, 16);
memcpy(&out->mi_bootid.meta[1], &meta1->bootid, 16);
memcpy(&out->mi_bootid.meta[2], &meta2->bootid, 16);
if (likely(bytes > size_before_dxbid))
memcpy(&out->mi_dxbid, &meta0->dxbid, 16);
}
memcpy(&out->mi_bootid.meta[0], &meta0->bootid, 16);
memcpy(&out->mi_bootid.meta[1], &meta1->bootid, 16);
memcpy(&out->mi_bootid.meta[2], &meta2->bootid, 16);
memcpy(&out->mi_dxbid, &meta0->dxbid, 16);
const volatile meta_t *txn_meta = head.ptr_v;
out->mi_last_pgno = txn_meta->geometry.first_unallocated - 1;
@@ -740,44 +790,38 @@ static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo
out->mi_maxreaders = env->max_readers;
out->mi_numreaders = env->lck_mmap.lck ? atomic_load32(&lck->rdt_length, mo_Relaxed) : INT32_MAX;
out->mi_dxb_pagesize = env->ps;
out->mi_sys_pagesize = globals.sys_pagesize;
if (likely(bytes > size_before_bootid)) {
const uint64_t unsynced_pages =
atomic_load64(&lck->unsynced_pages, mo_Relaxed) +
((uint32_t)out->mi_recent_txnid != atomic_load32(&lck->meta_sync_txnid, mo_Relaxed));
out->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages);
const uint64_t monotime_now = osal_monotime();
uint64_t ts = atomic_load64(&lck->eoos_timestamp, mo_Relaxed);
out->mi_since_sync_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0;
ts = atomic_load64(&lck->readers_check_timestamp, mo_Relaxed);
out->mi_since_reader_check_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0;
out->mi_autosync_threshold = pgno2bytes(env, atomic_load32(&lck->autosync_threshold, mo_Relaxed));
out->mi_autosync_period_seconds16dot16 =
osal_monotime_to_16dot16_noUnderflow(atomic_load64(&lck->autosync_period, mo_Relaxed));
out->mi_bootid.current.x = globals.bootid.x;
out->mi_bootid.current.y = globals.bootid.y;
out->mi_mode = env->lck_mmap.lck ? lck->envmode.weak : env->flags;
}
const uint64_t unsynced_pages = atomic_load64(&lck->unsynced_pages, mo_Relaxed) +
((uint32_t)out->mi_recent_txnid != atomic_load32(&lck->meta_sync_txnid, mo_Relaxed));
out->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages);
const uint64_t monotime_now = osal_monotime();
uint64_t ts = atomic_load64(&lck->eoos_timestamp, mo_Relaxed);
out->mi_since_sync_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0;
ts = atomic_load64(&lck->readers_check_timestamp, mo_Relaxed);
out->mi_since_reader_check_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0;
out->mi_autosync_threshold = pgno2bytes(env, atomic_load32(&lck->autosync_threshold, mo_Relaxed));
out->mi_autosync_period_seconds16dot16 =
osal_monotime_to_16dot16_noUnderflow(atomic_load64(&lck->autosync_period, mo_Relaxed));
out->mi_bootid.current.x = globals.bootid.x;
out->mi_bootid.current.y = globals.bootid.y;
out->mi_mode = env->lck_mmap.lck ? lck->envmode.weak : env->flags;
if (likely(bytes > size_before_pgop_stat)) {
#if MDBX_ENABLE_PGOP_STAT
out->mi_pgop_stat.newly = atomic_load64(&lck->pgops.newly, mo_Relaxed);
out->mi_pgop_stat.cow = atomic_load64(&lck->pgops.cow, mo_Relaxed);
out->mi_pgop_stat.clone = atomic_load64(&lck->pgops.clone, mo_Relaxed);
out->mi_pgop_stat.split = atomic_load64(&lck->pgops.split, mo_Relaxed);
out->mi_pgop_stat.merge = atomic_load64(&lck->pgops.merge, mo_Relaxed);
out->mi_pgop_stat.spill = atomic_load64(&lck->pgops.spill, mo_Relaxed);
out->mi_pgop_stat.unspill = atomic_load64(&lck->pgops.unspill, mo_Relaxed);
out->mi_pgop_stat.wops = atomic_load64(&lck->pgops.wops, mo_Relaxed);
out->mi_pgop_stat.prefault = atomic_load64(&lck->pgops.prefault, mo_Relaxed);
out->mi_pgop_stat.mincore = atomic_load64(&lck->pgops.mincore, mo_Relaxed);
out->mi_pgop_stat.msync = atomic_load64(&lck->pgops.msync, mo_Relaxed);
out->mi_pgop_stat.fsync = atomic_load64(&lck->pgops.fsync, mo_Relaxed);
out->mi_pgop_stat.newly = atomic_load64(&lck->pgops.newly, mo_Relaxed);
out->mi_pgop_stat.cow = atomic_load64(&lck->pgops.cow, mo_Relaxed);
out->mi_pgop_stat.clone = atomic_load64(&lck->pgops.clone, mo_Relaxed);
out->mi_pgop_stat.split = atomic_load64(&lck->pgops.split, mo_Relaxed);
out->mi_pgop_stat.merge = atomic_load64(&lck->pgops.merge, mo_Relaxed);
out->mi_pgop_stat.spill = atomic_load64(&lck->pgops.spill, mo_Relaxed);
out->mi_pgop_stat.unspill = atomic_load64(&lck->pgops.unspill, mo_Relaxed);
out->mi_pgop_stat.wops = atomic_load64(&lck->pgops.wops, mo_Relaxed);
out->mi_pgop_stat.prefault = atomic_load64(&lck->pgops.prefault, mo_Relaxed);
out->mi_pgop_stat.mincore = atomic_load64(&lck->pgops.mincore, mo_Relaxed);
out->mi_pgop_stat.msync = atomic_load64(&lck->pgops.msync, mo_Relaxed);
out->mi_pgop_stat.fsync = atomic_load64(&lck->pgops.fsync, mo_Relaxed);
#else
memset(&out->mi_pgop_stat, 0, sizeof(out->mi_pgop_stat));
memset(&out->mi_pgop_stat, 0, sizeof(out->mi_pgop_stat));
#endif /* MDBX_ENABLE_PGOP_STAT*/
}
txnid_t overall_latter_reader_txnid = out->mi_recent_txnid;
txnid_t self_latter_reader_txnid = overall_latter_reader_txnid;
@@ -800,22 +844,21 @@ static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo
return MDBX_SUCCESS;
}
__cold int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, size_t bytes, troika_t *troika) {
__cold int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, troika_t *troika) {
MDBX_envinfo snap;
int rc = env_info_snap(env, txn, &snap, sizeof(snap), troika);
int rc = env_info_snap(env, txn, &snap, troika);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
eASSERT(env, sizeof(snap) >= bytes);
while (1) {
rc = env_info_snap(env, txn, out, bytes, troika);
rc = env_info_snap(env, txn, out, troika);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
snap.mi_since_sync_seconds16dot16 = out->mi_since_sync_seconds16dot16;
snap.mi_since_reader_check_seconds16dot16 = out->mi_since_reader_check_seconds16dot16;
if (likely(memcmp(&snap, out, bytes) == 0))
if (likely(memcmp(&snap, out, sizeof(MDBX_envinfo)) == 0))
return MDBX_SUCCESS;
memcpy(&snap, out, bytes);
memcpy(&snap, out, sizeof(MDBX_envinfo));
}
}
@@ -823,11 +866,7 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_envin
if (unlikely((env == nullptr && txn == nullptr) || arg == nullptr))
return LOG_IFERR(MDBX_EINVAL);
const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid);
const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat);
const size_t size_before_dxbid = offsetof(MDBX_envinfo, mi_dxbid);
if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && bytes != size_before_pgop_stat &&
bytes != size_before_dxbid)
if (unlikely(bytes != sizeof(MDBX_envinfo)))
return LOG_IFERR(MDBX_EINVAL);
if (txn) {
@@ -846,7 +885,7 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_envin
}
troika_t troika;
return LOG_IFERR(env_info(env, txn, arg, bytes, &troika));
return LOG_IFERR(env_info(env, txn, arg, &troika));
}
__cold int mdbx_preopen_snapinfo(const char *pathname, MDBX_envinfo *out, size_t bytes) {
@@ -865,27 +904,18 @@ __cold int mdbx_preopen_snapinfoW(const wchar_t *pathname, MDBX_envinfo *out, si
if (unlikely(!out))
return LOG_IFERR(MDBX_EINVAL);
const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid);
const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat);
const size_t size_before_dxbid = offsetof(MDBX_envinfo, mi_dxbid);
if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && bytes != size_before_pgop_stat &&
bytes != size_before_dxbid)
if (unlikely(bytes != sizeof(MDBX_envinfo)))
return LOG_IFERR(MDBX_EINVAL);
memset(out, 0, bytes);
if (likely(bytes > size_before_bootid)) {
out->mi_bootid.current.x = globals.bootid.x;
out->mi_bootid.current.y = globals.bootid.y;
}
MDBX_env env;
memset(&env, 0, sizeof(env));
env.pid = osal_getpid();
if (unlikely(!is_powerof2(globals.sys_pagesize) || globals.sys_pagesize < MDBX_MIN_PAGESIZE)) {
ERROR("unsuitable system pagesize %u", globals.sys_pagesize);
return LOG_IFERR(MDBX_INCOMPATIBLE);
}
out->mi_sys_pagesize = globals.sys_pagesize;
memset(out, 0, bytes);
MDBX_env env;
memset(&env, 0, sizeof(env));
env.pid = osal_getpid();
env.flags = MDBX_RDONLY | MDBX_NORDAHEAD | MDBX_ACCEDE | MDBX_VALIDATION;
env.stuck_meta = -1;
env.lck_mmap.fd = INVALID_HANDLE_VALUE;
@@ -894,11 +924,12 @@ __cold int mdbx_preopen_snapinfoW(const wchar_t *pathname, MDBX_envinfo *out, si
env.fd4meta = INVALID_HANDLE_VALUE;
#if defined(_WIN32) || defined(_WIN64)
env.dxb_lock_event = INVALID_HANDLE_VALUE;
env.lck_lock_event = INVALID_HANDLE_VALUE;
env.ioring.overlapped_fd = INVALID_HANDLE_VALUE;
#endif /* Windows */
env_options_init(&env);
int rc = env_handle_pathname(&env, pathname, 0);
int err, rc = env_handle_pathname(&env, pathname, 0);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
rc = osal_openfile(MDBX_OPEN_DXB_READ, &env, env.pathname.dxb, &env.lazy_fd, 0);
@@ -918,17 +949,17 @@ __cold int mdbx_preopen_snapinfoW(const wchar_t *pathname, MDBX_envinfo *out, si
out->mi_geo.current = pgno2bytes(&env, header.geometry.now);
out->mi_last_pgno = header.geometry.first_unallocated - 1;
const unsigned n = 0;
out->mi_recent_txnid = constmeta_txnid(&header);
const unsigned n = 0;
out->mi_meta_sign[n] = unaligned_peek_u64(4, &header.sign);
if (likely(bytes > size_before_bootid)) {
memcpy(&out->mi_bootid.meta[n], &header.bootid, 16);
if (likely(bytes > size_before_dxbid))
memcpy(&out->mi_dxbid, &header.dxbid, 16);
}
memcpy(&out->mi_bootid.meta[n], &header.bootid, 16);
memcpy(&out->mi_dxbid, &header.dxbid, 16);
bailout:
env_close(&env, false);
err = env_info_sys(&env, out);
rc = rc ? rc : err;
err = env_close(&env, false);
rc = rc ? rc : err;
return LOG_IFERR(rc);
}
@@ -1100,13 +1131,16 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
}
const size_t unit_ps = (globals.sys_pagesize > (size_t)pagesize) ? globals.sys_pagesize : (size_t)pagesize;
const size_t unit_ag = (globals.sys_allocation_granularity > unit_ps) ? globals.sys_allocation_granularity : unit_ps;
const size_t unit_ag = (globals.sys_allocation_granularity > unit_ps &&
(growth_step < 0 || (size_t)growth_step >= globals.sys_allocation_granularity))
? globals.sys_allocation_granularity
: unit_ps;
size_lower = ceil_powerof2(size_lower, unit_ps);
size_upper = ceil_powerof2(size_upper, unit_ag);
size_now = ceil_powerof2(size_now, unit_ag);
/* LY: подбираем значение size_upper:
* - кратное размеру страницы
* - кратное размеру unit_ag (размеру страницы БД и системному размеру выделения)
* - без нарушения MAX_MAPSIZE и MAX_PAGENO */
while (unlikely((size_t)size_upper > MAX_MAPSIZE || (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) {
if ((size_t)size_upper < unit_ag + MIN_MAPSIZE || (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) {

View File

@@ -163,3 +163,29 @@ int mdbx_txn_unlock(MDBX_env *env) {
lck_txn_unlock(env);
return MDBX_SUCCESS;
}
/*------------------------------------------------------------------------------
* Auxiliary */
/* Formats the ratio numerator/denominator as text into a caller-provided buffer.
 *
 * Instead of failing, returns a static diagnostic string when the request cannot
 * be served: "nullptr" for a missing buffer, "buffer-to-small" when buffer_size
 * is less than sizeof(ratio2digits_buffer_t), and "infinity"/"undefined" for a
 * zero denominator (non-zero/zero numerator respectively).
 * Otherwise the result of ratio2digits() is returned. */
__cold const char *mdbx_ratio2digits(uint64_t numerator, uint64_t denominator, int precision, char *buffer,
                                     size_t buffer_size) {
  if (!buffer)
    return "nullptr";
  if (buffer_size < sizeof(ratio2digits_buffer_t))
    return "buffer-to-small";
  if (denominator)
    return ratio2digits(numerator, denominator, (ratio2digits_buffer_t *)buffer, precision);
  /* division by zero: distinguish x/0 from 0/0 */
  return numerator ? "infinity" : "undefined";
}
/* Formats `value` as a percentage of `whole` into a caller-provided buffer.
 *
 * Returns a static diagnostic string when the request cannot be served:
 * "nullptr" for a missing buffer, "buffer-to-small" when buffer_size is less
 * than sizeof(ratio2digits_buffer_t), and "infinity"/"undefined" for a zero
 * `whole` (non-zero/zero `value` respectively).
 * Otherwise the result of ratio2percent() is returned. */
__cold const char *mdbx_ratio2percents(uint64_t value, uint64_t whole, char *buffer, size_t buffer_size) {
  if (!buffer)
    return "nullptr";
  if (buffer_size < sizeof(ratio2digits_buffer_t))
    return "buffer-to-small";
  if (whole)
    return ratio2percent(value, whole, (ratio2digits_buffer_t *)buffer);
  /* nothing to relate to: distinguish x/0 from 0/0 */
  return value ? "infinity" : "undefined";
}

View File

@@ -124,7 +124,7 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) {
const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift;
assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
#if !defined(_MSC_VER) || defined(_DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
#if !defined(_MSC_VER) || !MDBX_WITHOUT_MSVC_CRT /* Workaround for MSVC error LNK2019: unresolved external \
symbol __except1 referenced in function __ftol3_except */
assert(key == mdbx_key_from_double((double)json_integer));
#endif /* Workaround for MSVC */
@@ -146,7 +146,7 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) {
assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
const uint64_t key =
bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
#if !defined(_MSC_VER) || defined(_DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
#if !defined(_MSC_VER) || !MDBX_WITHOUT_MSVC_CRT /* Workaround for MSVC error LNK2019: unresolved external \
symbol __except1 referenced in function __ftol3_except */
assert(key == mdbx_key_from_double((double)json_integer));
#endif /* Workaround for MSVC */

View File

@@ -37,7 +37,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, uint64_t in
return LOG_IFERR(rc);
if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
rc = tbl_fetch(txn, dbi);
rc = tbl_refresh_absent2baddbi(txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
@@ -146,8 +146,7 @@ __cold const char *mdbx_liberr2str(int errnum) {
" or Operation system not supported such operations",
"MDBX_INCOMPATIBLE: Environment or database is not compatible"
" with the requested operation or the specified flags",
"MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot,"
" e.g. read-transaction already run for current thread",
"MDBX_BAD_RSLOT: Reader locktable slot was unexpectly reused or cleared by an enemy thread",
"MDBX_BAD_TXN: Transaction is not valid for requested operation,"
" e.g. had errored and be must aborted, has a child, or is invalid",
"MDBX_BAD_VALSIZE: Invalid size or alignment of key or data"

View File

@@ -7,8 +7,7 @@
/* LY: avoid tsan-trap by txn, mm_last_pg and geo.first_unallocated */
__attribute__((__no_sanitize_thread__, __noinline__))
#endif
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
{
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
if (likely(rc == MDBX_SUCCESS))
rc = check_env(txn->env, true);

View File

@@ -7,6 +7,33 @@
#ifndef __cplusplus
/* Relaxes the CPU between retries of a busy-wait: issues a platform-specific
 * pause/yield intrinsic or instruction chosen at compile time, and falls back
 * to osal_yield() when none of the known targets matches. */
MDBX_MAYBE_UNUSED static __always_inline void atomic_yield(void) {
#if defined(_WIN32) || defined(_WIN64)
YieldProcessor();
#elif defined(__ia32__) || defined(__e2k__)
__builtin_ia32_pause();
#elif defined(__ia64__)
#if defined(__HP_cc__) || defined(__HP_aCC__)
_Asm_hint(_HINT_PAUSE);
#else
__asm__ __volatile__("hint @pause");
#endif
#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || defined(__ARM_ARCH_6K__)
#ifdef __CC_ARM
__yield();
#else
__asm__ __volatile__("yield");
#endif
#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && __mips_isa_rev >= 2
__asm__ __volatile__("pause");
#elif defined(__mips) || defined(__mips__) || defined(__mips64) || defined(__mips64__) || defined(_M_MRX000) || \
defined(_MIPS_) || defined(__MWERKS__) || defined(__sgi)
/* NOTE(review): presumably the raw encoding of the MIPS `pause` instruction,
 * emitted as data for toolchains/ISA levels lacking the mnemonic - confirm. */
__asm__ __volatile__(".word 0x00000140");
#else
osal_yield();
#endif
}
#ifdef MDBX_HAVE_C11ATOMICS
#define osal_memory_fence(order, write) atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order))
#else /* MDBX_HAVE_C11ATOMICS */
@@ -115,8 +142,7 @@ MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
__always_inline
#endif /* MDBX_64BIT_ATOMIC */
uint64_t
atomic_load64(const volatile mdbx_atomic_uint64_t *p, enum mdbx_memory_order order) {
uint64_t atomic_load64(const volatile mdbx_atomic_uint64_t *p, enum mdbx_memory_order order) {
STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#ifdef MDBX_HAVE_C11ATOMICS
@@ -144,38 +170,12 @@ MDBX_MAYBE_UNUSED static
if (likely(value == again))
return value;
value = again;
atomic_yield();
}
#endif /* !MDBX_64BIT_ATOMIC */
}
#endif /* atomic_load64 */
/* Relaxes the CPU between retries of a busy-wait: issues a platform-specific
 * pause/yield intrinsic or instruction chosen at compile time, and falls back
 * to osal_yield() when none of the known targets matches. */
MDBX_MAYBE_UNUSED static __always_inline void atomic_yield(void) {
#if defined(_WIN32) || defined(_WIN64)
YieldProcessor();
#elif defined(__ia32__) || defined(__e2k__)
__builtin_ia32_pause();
#elif defined(__ia64__)
#if defined(__HP_cc__) || defined(__HP_aCC__)
_Asm_hint(_HINT_PAUSE);
#else
__asm__ __volatile__("hint @pause");
#endif
#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || defined(__ARM_ARCH_6K__)
#ifdef __CC_ARM
__yield();
#else
__asm__ __volatile__("yield");
#endif
#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && __mips_isa_rev >= 2
__asm__ __volatile__("pause");
#elif defined(__mips) || defined(__mips__) || defined(__mips64) || defined(__mips64__) || defined(_M_MRX000) || \
defined(_MIPS_) || defined(__MWERKS__) || defined(__sgi)
/* NOTE(review): presumably the raw encoding of the MIPS `pause` instruction,
 * emitted as data for toolchains/ISA levels lacking the mnemonic - confirm. */
__asm__ __volatile__(".word 0x00000140");
#else
osal_yield();
#endif
}
#if MDBX_64BIT_CAS
MDBX_MAYBE_UNUSED static __always_inline bool atomic_cas64(mdbx_atomic_uint64_t *p, uint64_t c, uint64_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
@@ -312,10 +312,11 @@ MDBX_MAYBE_UNUSED static __always_inline void safe64_write(mdbx_atomic_uint64_t
MDBX_MAYBE_UNUSED static __always_inline uint64_t safe64_read(const mdbx_atomic_uint64_t *p) {
jitter4testing(true);
uint64_t v;
do
uint64_t v = atomic_load64(p, mo_AcquireRelease);
while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak)) {
atomic_yield();
v = atomic_load64(p, mo_AcquireRelease);
while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak));
}
return v;
}
@@ -353,8 +354,7 @@ MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
__always_inline
#endif /* MDBX_64BIT_ATOMIC */
void
safe64_inc(mdbx_atomic_uint64_t *p, const uint64_t v) {
void safe64_inc(mdbx_atomic_uint64_t *p, const uint64_t v) {
assert(v > 0);
safe64_update(p, safe64_read(p) + v);
}

320
src/chk.c
View File

@@ -196,6 +196,49 @@ __cold static MDBX_chk_line_t *chk_print_size(MDBX_chk_line_t *line, const char
return line;
}
/* Appends the textual form of numerator/denominator (as produced by
 * ratio2digits() with the given precision) to the line.
 * A NULL line is passed through untouched. Returns the resulting line. */
__cold static MDBX_chk_line_t *chk_print_ratio(MDBX_chk_line_t *line, size_t numerator, size_t denominator,
                                               unsigned precision) {
  if (!line)
    return line;

  ratio2digits_buffer_t digits;
  const char *const text = ratio2digits(numerator, denominator, &digits, precision);
  return chk_puts(line, text);
}
/* Appends "<prefix> <value><unit>[s] (<percent>%<inner>)<tail>" to the line,
 * where <percent> is `value` relative to `whole` as formatted by ratio2percent().
 *
 * `triplet` holds three consecutive NUL-terminated strings laid out back to back:
 * the prefix, the text placed inside the parentheses after the percent sign,
 * and the tail placed after the closing parenthesis.
 * A NULL line is passed through untouched. Returns the resulting line. */
__cold static MDBX_chk_line_t *chk_print_percent(MDBX_chk_line_t *line, const char *triplet, size_t value, size_t whole,
                                                 const char *unit) {
  if (!line)
    return line;

  /* walk over the three packed strings */
  const char *const prefix = triplet;
  const char *const inner = prefix + strlen(prefix) + 1;
  const char *const tail = inner + strlen(inner) + 1;

  /* no plural suffix for an empty unit or for exactly one item */
  const char *const plural = (*unit == 0 || value == 1) ? "" : "s";

  ratio2digits_buffer_t digits;
  const char *const percent = ratio2percent(value, whole, &digits);
  return chk_print(line, "%s %" PRIuSIZE "%s%s (%s%%%s)%s", prefix, value, unit, plural, percent, inner, tail);
}
/* Shortcut for chk_print_percent() that reports the amount in pages. */
__cold static MDBX_chk_line_t *chk_print_pages_percent(MDBX_chk_line_t *line, const char *triplet, size_t pages,
                                                       size_t whole) {
  static const char unit_page[] = " page";
  return chk_print_percent(line, triplet, pages, whole, unit_page);
}
/* Shortcut for chk_print_percent() that reports the amount in bytes.
 * NOTE(review): the third parameter is named `pages` but is printed with the
 * " byte" unit, so it presumably carries a byte count - consider renaming. */
__cold static MDBX_chk_line_t *chk_print_bytes_percent(MDBX_chk_line_t *line, const char *triplet, size_t pages,
                                                       size_t whole) {
  static const char unit_byte[] = " byte";
  return chk_print_percent(line, triplet, pages, whole, unit_byte);
}
/* Appends "<prefix> <pages> page[s] (<x>% of backed, <y>% of boundary)" to the line,
 * where <x> and <y> are `pages` relative to `backed` and `boundary` respectively,
 * both formatted by ratio2percent().
 * A NULL line is passed through untouched. Returns the resulting line. */
__cold static MDBX_chk_line_t *chk_print_pages_percent_bb(MDBX_chk_line_t *line, const char *prefix, size_t pages,
                                                          size_t backed, size_t boundary) {
  if (!line)
    return line;

  /* each percentage needs its own buffer since both strings are alive at the same time */
  ratio2digits_buffer_t digits_backed, digits_boundary;
  const char *const of_backed = ratio2percent(pages, backed, &digits_backed);
  const char *const of_boundary = ratio2percent(pages, boundary, &digits_boundary);
  /* no plural suffix for exactly one page */
  const char *const plural = (pages == 1) ? "" : "s";
  return chk_print(line, "%s %" PRIuSIZE " page%s (%s%% of backed, %s%% of boundary)", prefix, pages, plural,
                   of_backed, of_boundary);
}
__cold static int chk_error_rc(MDBX_chk_scope_t *const scope, int err, const char *subj) {
MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error);
if (line)
@@ -568,11 +611,23 @@ static void histogram_acc(const size_t n, struct MDBX_chk_histogram *p) {
__cold static MDBX_chk_line_t *histogram_dist(MDBX_chk_line_t *line, const struct MDBX_chk_histogram *histogram,
const char *prefix, const char *first, bool amount) {
/* https://en.wikipedia.org/wiki/Multiplication_sign */
#if defined(unix) || defined(linux) || defined(__unix__) || defined(__unix) || defined(__linux__) || \
defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE)
#define UNICODE_MULSIGN_STR "×"
#define UNICODE_MULSIGN_FMT "s"
#elif defined(_WIN32) || defined(_WIN64)
#define UNICODE_MULSIGN_STR L"\u00d7"
#define UNICODE_MULSIGN_FMT "ls"
#else
#define UNICODE_MULSIGN_STR "*"
#define UNICODE_MULSIGN_FMT "s"
#endif
line = chk_print(line, "%s:", prefix);
const char *comma = "";
const size_t first_val = amount ? histogram->ones : histogram->pad;
if (first_val) {
chk_print(line, " %s=%" PRIuSIZE, first, first_val);
chk_print(line, " %s%" UNICODE_MULSIGN_FMT "%" PRIuSIZE, first, UNICODE_MULSIGN_STR, first_val);
comma = ",";
}
for (size_t n = 0; n < ARRAY_LENGTH(histogram->ranges); ++n)
@@ -580,7 +635,8 @@ __cold static MDBX_chk_line_t *histogram_dist(MDBX_chk_line_t *line, const struc
chk_print(line, "%s %" PRIuSIZE, comma, histogram->ranges[n].begin);
if (histogram->ranges[n].begin != histogram->ranges[n].end - 1)
chk_print(line, "-%" PRIuSIZE, histogram->ranges[n].end - 1);
line = chk_print(line, "=%" PRIuSIZE, amount ? histogram->ranges[n].amount : histogram->ranges[n].count);
line = chk_print(line, "%" UNICODE_MULSIGN_FMT "%" PRIuSIZE, UNICODE_MULSIGN_STR,
amount ? histogram->ranges[n].amount : histogram->ranges[n].count);
comma = ",";
}
return line;
@@ -668,7 +724,7 @@ __cold static void chk_verbose_meta(MDBX_chk_scope_t *const scope, const unsigne
line = chk_print(line, " txn#%" PRIaTXN ", ", meta_txnid);
if (chk->envinfo.mi_bootid.meta[num].x | chk->envinfo.mi_bootid.meta[num].y)
line = chk_print(line, "boot-id %" PRIx64 "-%" PRIx64 " (%s)", chk->envinfo.mi_bootid.meta[num].x,
chk->envinfo.mi_bootid.meta[num].y, bootid_match ? "live" : "not match");
chk->envinfo.mi_bootid.meta[num].y, bootid_match ? "live" : "dissimilar");
else
line = chk_puts(line, "no boot-id");
@@ -701,7 +757,8 @@ __cold static int chk_pgvisitor(const size_t pgno, const unsigned npages, void *
chk_scope_issue(scope, "too deeply %u, page %zu, parent %zu", deep, pgno, parent_pgno);
return MDBX_CORRUPTED /* avoid infinite loop/recursion */;
}
histogram_acc(deep, &tbl->histogram.deep);
if (pagetype != page_large)
histogram_acc(deep, &tbl->histogram.height);
usr->result.processed_pages += npages;
const size_t page_bytes = payload_bytes + header_bytes + unused_bytes;
@@ -722,27 +779,28 @@ __cold static int chk_pgvisitor(const size_t pgno, const unsigned npages, void *
const char *pagetype_caption;
bool branch = false;
struct MDBX_chk_histogram *filling = nullptr;
struct MDBX_chk_histogram *density = nullptr;
switch (pagetype) {
default:
chk_object_issue(scope, "page", pgno, "unknown page-type", "type %u, deep %i, parent %zu", (unsigned)pagetype, deep,
parent_pgno);
pagetype_caption = "unknown";
tbl->pages.other += npages;
tbl->pages.broken += npages;
break;
case page_broken:
assert(page_err != MDBX_SUCCESS);
pagetype_caption = "broken";
tbl->pages.other += npages;
tbl->pages.broken += npages;
break;
case page_sub_broken:
assert(page_err != MDBX_SUCCESS);
pagetype_caption = "broken-subpage";
tbl->pages.other += npages;
tbl->pages.broken += npages;
break;
case page_large:
pagetype_caption = "large";
histogram_acc(npages, &tbl->histogram.large_pages);
density = &tbl->histogram.large_or_nested_density;
if (tbl->flags & MDBX_DUPSORT)
chk_object_issue(scope, "page", pgno, "unexpected", "type %u, table %s flags 0x%x, deep %i, parent %zu",
(unsigned)pagetype, chk_v2a(chk, &tbl->name), tbl->flags, deep, parent_pgno);
@@ -752,11 +810,11 @@ __cold static int chk_pgvisitor(const size_t pgno, const unsigned npages, void *
if (!nested) {
pagetype_caption = "branch";
tbl->pages.branch += 1;
filling = &tbl->histogram.tree_filling;
density = &tbl->histogram.tree_density;
} else {
pagetype_caption = "nested-branch";
tbl->pages.nested_branch += 1;
filling = &tbl->histogram.nested_tree_filling;
density = &tbl->histogram.large_or_nested_density;
}
break;
case page_dupfix_leaf:
@@ -769,16 +827,16 @@ __cold static int chk_pgvisitor(const size_t pgno, const unsigned npages, void *
if (!nested) {
pagetype_caption = "leaf";
tbl->pages.leaf += 1;
filling = &tbl->histogram.tree_filling;
density = &tbl->histogram.tree_density;
if (height != tbl_info->internal->height)
chk_object_issue(scope, "page", pgno, "wrong tree height", "actual %i != %i table %s, parent %zu", height,
tbl_info->internal->height, chk_v2a(chk, &tbl->name), parent_pgno);
} else {
pagetype_caption = (pagetype == page_leaf) ? "nested-leaf" : "nested-leaf-dupfix";
tbl->pages.nested_leaf += 1;
filling = &tbl->histogram.nested_tree_filling;
density = &tbl->histogram.large_or_nested_density;
if (chk->last_nested != nested) {
histogram_acc(height, &tbl->histogram.nested_tree);
histogram_acc(height, &tbl->histogram.nested_height);
chk->last_nested = nested;
}
if (height != nested->height)
@@ -794,12 +852,12 @@ __cold static int chk_pgvisitor(const size_t pgno, const unsigned npages, void *
chk_object_issue(scope, "page", pgno, "unexpected", "type %u, table %s flags 0x%x, deep %i, parent %zu",
(unsigned)pagetype, chk_v2a(chk, &tbl->name), tbl->flags, deep, parent_pgno);
else
filling = &tbl->histogram.nested_tree_filling;
density = &tbl->histogram.large_or_nested_density;
break;
}
if (filling)
histogram_acc((page_size - unused_bytes) * 100 / page_size, filling);
if (density)
histogram_acc((page_size - unused_bytes) * 100 / page_size, density);
if (npages) {
if (tbl->cookie) {
@@ -914,7 +972,7 @@ __cold static int chk_tree(MDBX_chk_scope_t *const scope) {
total.lost_bytes += tbl->lost_bytes;
total.pages.all += tbl->pages.all;
total.pages.empty += tbl->pages.empty;
total.pages.other += tbl->pages.other;
total.pages.broken += tbl->pages.broken;
total.pages.branch += tbl->pages.branch;
total.pages.leaf += tbl->pages.leaf;
total.pages.nested_branch += tbl->pages.nested_branch;
@@ -934,53 +992,55 @@ __cold static int chk_tree(MDBX_chk_scope_t *const scope) {
if (scope->verbosity > MDBX_chk_info) {
for (size_t i = 0; i < ARRAY_LENGTH(chk->table) && chk->table[i]; ++i) {
MDBX_chk_table_t *const tbl = chk->table[i];
MDBX_chk_scope_t *inner = chk_scope_push(scope, 0, "tree %s:", chk_v2a(chk, &tbl->name));
if (tbl->pages.all == 0)
chk_line_end(chk_print(chk_line_begin(inner, MDBX_chk_resolution), "empty"));
else {
MDBX_chk_scope_t *inner =
chk_scope_push(scope, 0, (tbl->pages.all ? "b-tree %s, subtotal %" PRIuSIZE " pages:" : "b-tree %s: empty"),
chk_v2a(chk, &tbl->name), tbl->pages.all);
if (tbl->pages.all) {
MDBX_chk_line_t *line = chk_line_begin(inner, MDBX_chk_info);
if (line) {
line = chk_print(line, "page usage: subtotal %" PRIuSIZE, tbl->pages.all);
const size_t branch_pages = tbl->pages.branch + tbl->pages.nested_branch;
const size_t leaf_pages = tbl->pages.leaf + tbl->pages.nested_leaf + tbl->pages.nested_subleaf;
if (tbl->pages.other)
line = chk_print(line, ", other %" PRIuSIZE, tbl->pages.other);
if (tbl->pages.other == 0 || (branch_pages | leaf_pages | tbl->histogram.large_pages.count) != 0) {
line = chk_print(line, ", branch %" PRIuSIZE ", leaf %" PRIuSIZE, branch_pages, leaf_pages);
line = chk_puts(line, "pages composition: ");
if (tbl->pages.broken)
line = chk_print(line, "broken %" PRIuSIZE ", ", tbl->pages.broken);
if (tbl->pages.broken != tbl->pages.all) {
line = chk_print(line, "branch %" PRIuSIZE ", leaf %" PRIuSIZE, tbl->pages.branch, tbl->pages.leaf);
if (tbl->pages.nested_subleaf || (tbl->flags & MDBX_DUPSORT) != 0)
line = chk_print(line, ", subleaf %" PRIuSIZE, tbl->pages.nested_subleaf);
if (tbl->pages.nested_branch || (tbl->flags & MDBX_DUPSORT) != 0)
line = chk_print(line, ", nested-branch %" PRIuSIZE, tbl->pages.nested_branch);
if (tbl->pages.nested_leaf || (tbl->flags & MDBX_DUPSORT) != 0)
line = chk_print(line, ", nested-leaf %" PRIuSIZE, tbl->pages.nested_leaf);
if (tbl->histogram.large_pages.count || (tbl->flags & MDBX_DUPSORT) == 0) {
line = chk_print(line, ", large %" PRIuSIZE, tbl->histogram.large_pages.count);
if (tbl->histogram.large_pages.amount | tbl->histogram.large_pages.count)
line = histogram_print(inner, line, &tbl->histogram.large_pages, " amount", "single", true);
}
}
line = histogram_dist(chk_line_feed(line), &tbl->histogram.deep, "tree deep density", "1", false);
if (tbl != &chk->table_gc && tbl->histogram.nested_tree.count) {
line = chk_print(chk_line_feed(line), "nested tree(s) %" PRIuSIZE, tbl->histogram.nested_tree.count);
line = histogram_dist(line, &tbl->histogram.nested_tree, " density", "1", false);
line = chk_print(chk_line_feed(line),
"nested tree(s) pages %" PRIuSIZE ": branch %" PRIuSIZE ", leaf %" PRIuSIZE
", subleaf %" PRIuSIZE,
tbl->pages.nested_branch + tbl->pages.nested_leaf, tbl->pages.nested_branch,
tbl->pages.nested_leaf, tbl->pages.nested_subleaf);
}
line = histogram_dist(chk_line_feed(line), &tbl->histogram.height, "tree levels", "1", false);
if ((tbl->flags & MDBX_DUPSORT) != 0 || (tbl->histogram.nested_height.count && tbl != &chk->table_gc)) {
line = chk_print(chk_line_feed(line),
"nested tree(s): quantity %" PRIuSIZE ", subtotal pages %" PRIuSIZE ", ",
tbl->histogram.nested_height.count, tbl->pages.nested_branch + tbl->pages.nested_leaf);
if (tbl != &chk->table_gc && tbl->histogram.nested_height.count)
line = histogram_dist(line, &tbl->histogram.nested_height, "levels", "1", false);
}
line = chk_line_feed(line);
const size_t bytes = pgno2bytes(env, tbl->pages.all);
line =
chk_print(chk_line_feed(line),
"page filling: subtotal %" PRIuSIZE " bytes (%.1f%%), payload %" PRIuSIZE
" (%.1f%%), unused %" PRIuSIZE " (%.1f%%)",
bytes, bytes * 100.0 / total_page_bytes, tbl->payload_bytes, tbl->payload_bytes * 100.0 / bytes,
bytes - tbl->payload_bytes, (bytes - tbl->payload_bytes) * 100.0 / bytes);
if (tbl->pages.empty)
line = chk_print(line, ", %" PRIuSIZE " empty pages", tbl->pages.empty);
if (tbl->lost_bytes)
line = chk_print(line, ", %" PRIuSIZE " bytes lost", tbl->lost_bytes);
const size_t bytes = pgno2bytes(env, tbl->pages.all);
line = chk_print_bytes_percent(line, "pages density: subtotal\0\0", bytes, total_page_bytes);
line = chk_print_percent(line, ", payload\0\0", tbl->payload_bytes, bytes, "");
line = chk_print_percent(line, ", unused\0\0", bytes - tbl->payload_bytes, bytes, "");
if (tbl->pages.empty)
line = chk_print(line, ", %" PRIuSIZE " empty pages", tbl->pages.empty);
if (tbl->lost_bytes)
line = chk_print(line, ", %" PRIuSIZE " bytes lost", tbl->lost_bytes);
line =
histogram_dist(chk_line_feed(line), &tbl->histogram.tree_filling, "tree %-filling density", "1", false);
if (tbl->histogram.nested_tree_filling.count)
line = histogram_dist(chk_line_feed(line), &tbl->histogram.nested_tree_filling,
"nested tree(s) %-filling density", "1", false);
line = histogram_dist(chk_line_feed(line), &tbl->histogram.tree_density, "pages %-density distribution",
"1", false);
if (tbl->histogram.large_or_nested_density.count)
line = histogram_dist(chk_line_feed(line), &tbl->histogram.large_or_nested_density,
(tbl->flags & MDBX_DUPSORT) ? "nested %-density distribution"
: "large pages %-density distribution",
"1", false);
}
chk_line_end(line);
}
}
@@ -989,14 +1049,11 @@ __cold static int chk_tree(MDBX_chk_scope_t *const scope) {
}
MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution);
line = chk_print(line,
"summary: total %" PRIuSIZE " bytes, payload %" PRIuSIZE " (%.1f%%), unused %" PRIuSIZE " (%.1f%%),"
" average fill %.1f%%",
total_page_bytes, usr->result.total_payload_bytes,
usr->result.total_payload_bytes * 100.0 / total_page_bytes,
total_page_bytes - usr->result.total_payload_bytes,
(total_page_bytes - usr->result.total_payload_bytes) * 100.0 / total_page_bytes,
usr->result.total_payload_bytes * 100.0 / total_page_bytes);
line = chk_print(line, "summary: total %" PRIuSIZE " bytes", total_page_bytes);
line =
chk_print_percent(line, ", payload\0 average density\0", usr->result.total_payload_bytes, total_page_bytes, "");
line = chk_print_percent(line, ", unused\0 average sparsity\0", total_page_bytes - usr->result.total_payload_bytes,
total_page_bytes, "");
if (total.pages.empty)
line = chk_print(line, ", %" PRIuSIZE " empty pages", total.pages.empty);
if (total.lost_bytes)
@@ -1285,15 +1342,16 @@ bailout:
if (handler) {
if (record_count) {
MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info);
line = histogram_dist(line, &tbl->histogram.key_len, "key length density", "0/1", false);
line = histogram_dist(line, &tbl->histogram.key_len, "key length distribution", "0/1", false);
chk_line_feed(line);
line = histogram_dist(line, &tbl->histogram.val_len, "value length density", "0/1", false);
line = histogram_dist(line, &tbl->histogram.val_len, "value length distribution", "0/1", false);
if (tbl->histogram.multival.amount) {
chk_line_feed(line);
line = histogram_dist(line, &tbl->histogram.multival, "number of multi-values density", "single", false);
line = histogram_dist(line, &tbl->histogram.multival, "number of multi-values distribution", "single", false);
chk_line_feed(line);
line = chk_print(line, "number of keys %" PRIuSIZE ", average values per key %.1f",
tbl->histogram.multival.count, record_count / (double)tbl->histogram.multival.count);
line =
chk_print(line, "number of keys %" PRIuSIZE ", average values per key ", tbl->histogram.multival.count);
line = chk_print_ratio(line, record_count, tbl->histogram.multival.count, 1);
}
chk_line_end(line);
}
@@ -1302,16 +1360,20 @@ bailout:
if (chk->cb->table_conclude)
err = chk->cb->table_conclude(usr, tbl, cursor, err);
MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution);
line = chk_print(line, "summary: %" PRIuSIZE " records,", record_count);
if (dups || (tbl->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)))
line = chk_print(line, " %" PRIuSIZE " dups,", dups);
if (sub_databases || dbi == MAIN_DBI)
line = chk_print(line, " %" PRIuSIZE " tables,", sub_databases);
line = chk_print(line,
" %" PRIuSIZE " key's bytes,"
" %" PRIuSIZE " data's bytes,"
" %" PRIuSIZE " problem(s)",
tbl->histogram.key_len.amount, tbl->histogram.val_len.amount, scope->subtotal_issues);
if (record_count | tbl->histogram.key_len.amount | tbl->histogram.val_len.amount | scope->subtotal_issues) {
line = chk_print(line, "summary: %" PRIuSIZE " records,", record_count);
if (dups || (tbl->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)))
line = chk_print(line, " %" PRIuSIZE " dups,", dups);
if (sub_databases || dbi == MAIN_DBI)
line = chk_print(line, " %" PRIuSIZE " tables,", sub_databases);
line = chk_print(line,
" %" PRIuSIZE " key's bytes,"
" %" PRIuSIZE " data's bytes,"
" %" PRIuSIZE " problem(s)",
tbl->histogram.key_len.amount, tbl->histogram.val_len.amount, scope->subtotal_issues);
} else {
line = chk_puts(line, "empty");
}
chk_line_end(chk_flush(line));
}
@@ -1405,7 +1467,7 @@ __cold static int chk_handle_gc(MDBX_chk_scope_t *const scope, MDBX_chk_table_t
iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) : pgno_sub(pgno, span));
++span)
;
histogram_acc(span, &tbl->histogram.nested_tree);
histogram_acc(span, &tbl->histogram.nested_height);
MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra);
if (line) {
if (span > 1)
@@ -1429,7 +1491,7 @@ __cold static int env_chk(MDBX_chk_scope_t *const scope) {
MDBX_chk_context_t *const usr = chk->usr;
MDBX_env *const env = usr->env;
MDBX_txn *const txn = usr->txn;
int err = env_info(env, txn, &chk->envinfo, sizeof(chk->envinfo), &chk->troika);
int err = env_info(env, txn, &chk->envinfo, &chk->troika);
if (unlikely(err))
return chk_error_rc(scope, err, "env_info");
@@ -1449,9 +1511,11 @@ __cold static int env_chk(MDBX_chk_scope_t *const scope) {
line = chk_puts(line, "is unavailable");
chk_line_end(line);
err = osal_filesize(env->lazy_fd, &env->dxb_mmap.filesize);
if (unlikely(err))
return chk_error_rc(scope, err, "osal_filesize");
line = chk_print_size(chk_line_begin(scope, MDBX_chk_verbose), "system unified page cache block ",
chk->envinfo.mi_sys_upcblk, "");
chk_line_end(line);
env->dxb_mmap.filesize = chk->envinfo.mi_dxb_fsize;
//--------------------------------------------------------------------------
@@ -1503,11 +1567,13 @@ __cold static int env_chk(MDBX_chk_scope_t *const scope) {
usr->result.backed_pages = (size_t)dxbfile_pages;
}
line = chk_line_feed(chk_print(chk_line_begin(inner, MDBX_chk_info),
"pagesize %u (%u system), max keysize %u..%u"
", max readers %u",
line = chk_line_feed(chk_print(chk_line_begin(inner, MDBX_chk_info), "pagesize %u (%u system), max keysize %u..%u",
env->ps, globals.sys_pagesize, mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT),
mdbx_env_get_maxkeysize_ex(env, MDBX_DB_DEFAULTS), env->max_readers));
mdbx_env_get_maxkeysize_ex(env, MDBX_DB_DEFAULTS)));
if ((env->flags & MDBX_EXCLUSIVE) == 0 && env->lck_mmap.lck) {
line = chk_line_feed(chk_print(chk_line_begin(inner, MDBX_chk_info), "currently %u readers of %u maximum",
atomic_load32(&env->lck_mmap.lck->rdt_length, mo_Relaxed), env->max_readers));
}
line = chk_line_feed(chk_print_size(line, "mapsize ", env->dxb_mmap.current, nullptr));
if (txn->geo.lower == txn->geo.upper)
line = chk_print_size(line, "fixed datafile: ", chk->envinfo.mi_geo.current, nullptr);
@@ -1530,6 +1596,19 @@ __cold static int env_chk(MDBX_chk_scope_t *const scope) {
chk_line_end(chk_print(line, " > until it will be closed or reopened in read-write mode."));
}
#endif /* Windows || Debug */
line = chk_print_size(chk_line_begin(scope, MDBX_chk_verbose), "filesystem: io-block ", chk->envinfo.mi_sys_ioblk,
", space allocated for the dxb-file ");
if (chk->envinfo.mi_dxb_fallocated == chk->envinfo.mi_geo.current) {
line = chk_puts(line, "exactly");
} else {
line = chk_print_size(
line, (chk->envinfo.mi_dxb_fallocated > chk->envinfo.mi_geo.current) ? "with excess " : "partially ",
chk->envinfo.mi_dxb_fallocated, " ");
ratio2digits_buffer_t buffer;
line =
chk_print(line, "%s%%", ratio2percent(chk->envinfo.mi_dxb_fallocated, chk->envinfo.mi_geo.current, &buffer));
}
chk_line_end(line);
chk_verbose_meta(inner, 0);
chk_verbose_meta(inner, 1);
chk_verbose_meta(inner, 2);
@@ -1628,7 +1707,7 @@ __cold static int env_chk(MDBX_chk_scope_t *const scope) {
err = chk_db(usr->scope, FREE_DBI, &chk->table_gc, chk_handle_gc);
line = chk_line_begin(scope, MDBX_chk_info);
if (line) {
histogram_print(scope, line, &chk->table_gc.histogram.nested_tree, "span(s)", "single", false);
histogram_print(scope, line, &chk->table_gc.histogram.nested_height, "span(s)", "single", false);
chk_line_end(line);
}
if (usr->result.problems_gc == 0 && (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) {
@@ -1646,69 +1725,50 @@ __cold static int env_chk(MDBX_chk_scope_t *const scope) {
//--------------------------------------------------------------------------
err = chk_scope_begin(chk, 1, MDBX_chk_space, nullptr, nullptr, "Page allocation:");
const double percent_boundary_reciprocal = 100.0 / txn->geo.upper;
const double percent_backed_reciprocal = 100.0 / usr->result.backed_pages;
const size_t backed = usr->result.backed_pages;
const size_t boundary = txn->geo.upper;
const size_t detained = usr->result.gc_pages - usr->result.reclaimable_pages;
const size_t available2boundary = txn->geo.upper - usr->result.alloc_pages + usr->result.reclaimable_pages;
const size_t available2backed = usr->result.backed_pages - usr->result.alloc_pages + usr->result.reclaimable_pages;
const size_t remained2boundary = txn->geo.upper - usr->result.alloc_pages;
const size_t remained2backed = usr->result.backed_pages - usr->result.alloc_pages;
const size_t available2boundary = boundary - usr->result.alloc_pages + usr->result.reclaimable_pages;
const size_t available2backed = backed - usr->result.alloc_pages + usr->result.reclaimable_pages;
const size_t remained2boundary = boundary - usr->result.alloc_pages;
const size_t remained2backed = backed - usr->result.alloc_pages;
const size_t used = (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) ? usr->result.alloc_pages - usr->result.gc_pages
: usr->result.processed_pages;
line = chk_line_begin(usr->scope, MDBX_chk_info);
line = chk_print(line,
"backed by file: %" PRIuSIZE " pages (%.1f%%)"
", %" PRIuSIZE " left to boundary (%.1f%%)",
usr->result.backed_pages, usr->result.backed_pages * percent_boundary_reciprocal,
txn->geo.upper - usr->result.backed_pages,
(txn->geo.upper - usr->result.backed_pages) * percent_boundary_reciprocal);
line = chk_print_pages_percent(line, "backed by file:\0 of boundary\0", backed, boundary);
line = chk_print_pages_percent(line, ",\0\0 left to boundary", boundary - backed, boundary);
line = chk_line_feed(line);
line = chk_print(line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", "used", used,
used * percent_backed_reciprocal, used * percent_boundary_reciprocal);
line = chk_print_pages_percent_bb(line, "used:", used, backed, boundary);
line = chk_line_feed(line);
line = chk_print(line, "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE " to boundary (%.1f%% of boundary)",
"remained", remained2backed, remained2backed * percent_backed_reciprocal, remained2boundary,
remained2boundary * percent_boundary_reciprocal);
line = chk_print_pages_percent(line, "remained:\0\0 of backed", remained2backed, backed);
line = chk_print_pages_percent(line, ", left\0\0 to boundary", remained2boundary, boundary);
line = chk_line_feed(line);
line =
chk_print(line,
"reclaimable: %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)"
", GC %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)",
usr->result.reclaimable_pages, usr->result.reclaimable_pages * percent_backed_reciprocal,
usr->result.reclaimable_pages * percent_boundary_reciprocal, usr->result.gc_pages,
usr->result.gc_pages * percent_backed_reciprocal, usr->result.gc_pages * percent_boundary_reciprocal);
line = chk_print_pages_percent_bb(line, "reclaimable:", usr->result.reclaimable_pages, backed, boundary);
line = chk_print_pages_percent_bb(line, ", within GC", usr->result.gc_pages, backed, boundary);
line = chk_line_feed(line);
line = chk_print(line,
"detained by reader(s): %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)"
", %u reader(s), lag %" PRIi64,
detained, detained * percent_backed_reciprocal, detained * percent_boundary_reciprocal,
chk->envinfo.mi_numreaders, chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid);
line = chk_print_pages_percent_bb(line, "detained by reader(s):", detained, backed, boundary);
line = chk_print(line, ", %u reader(s), lag %" PRIi64, chk->envinfo.mi_numreaders,
chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid);
line = chk_line_feed(line);
line = chk_print(line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", "allocated",
usr->result.alloc_pages, usr->result.alloc_pages * percent_backed_reciprocal,
usr->result.alloc_pages * percent_boundary_reciprocal);
line = chk_print_pages_percent_bb(line, "allocated:", usr->result.alloc_pages, backed, boundary);
line = chk_line_feed(line);
line = chk_print(line, "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE " to boundary (%.1f%% of boundary)",
"available", available2backed, available2backed * percent_backed_reciprocal, available2boundary,
available2boundary * percent_boundary_reciprocal);
line = chk_print_pages_percent(line, "available:\0 of backed\0", available2backed, backed);
line = chk_print_pages_percent(line, ", left\0\0 to boundary", available2boundary, boundary);
chk_line_end(line);
line = chk_line_begin(usr->scope, MDBX_chk_resolution);
line = chk_print(line, "%s %" PRIaPGNO " pages", (txn->geo.upper == txn->geo.now) ? "total" : "upto", txn->geo.upper);
line = chk_print(line, ", backed %" PRIuSIZE " (%.1f%%)", usr->result.backed_pages,
usr->result.backed_pages * percent_boundary_reciprocal);
line = chk_print(line, ", allocated %" PRIuSIZE " (%.1f%%)", usr->result.alloc_pages,
usr->result.alloc_pages * percent_boundary_reciprocal);
line = chk_print(line, ", available %" PRIuSIZE " (%.1f%%)", available2boundary,
available2boundary * percent_boundary_reciprocal);
line = chk_print(line, "%s %zu pages", (boundary == txn->geo.now) ? "total" : "upto", boundary);
line = chk_print_pages_percent(line, ", backed\0\0", backed, boundary);
line = chk_print_pages_percent(line, ", allocated\0\0", usr->result.alloc_pages, boundary);
line = chk_print_pages_percent(line, ", available\0\0", available2boundary, boundary);
chk_line_end(line);
chk_scope_restore(scope, err);

View File

@@ -200,15 +200,21 @@ static inline bool check_table_flags(unsigned flags) {
}
}
static inline int tbl_setup_ifneed(const MDBX_env *env, volatile kvx_t *const kvx, const tree_t *const db) {
MDBX_MAYBE_UNUSED static inline int tbl_setup_ifneed(const MDBX_env *env, volatile kvx_t *const kvx,
const tree_t *const db) {
return likely(kvx->clc.v.lmax) ? MDBX_SUCCESS : tbl_setup(env, kvx, db);
}
/* Calls tbl_refresh() for the given DBI and translates an MDBX_NOTFOUND
 * result into MDBX_BAD_DBI; any other result code (including success)
 * is returned unchanged. */
MDBX_MAYBE_UNUSED static inline int tbl_refresh_absent2baddbi(MDBX_txn *txn, size_t dbi) {
  const int err = tbl_refresh(txn, dbi);
  if (unlikely(err == MDBX_NOTFOUND))
    return MDBX_BAD_DBI;
  return err;
}
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_PURE_FUNCTION static inline size_t pgno2bytes(const MDBX_env *env, size_t pgno) {
eASSERT(env, (1u << env->ps2ln) == env->ps);
return ((size_t)pgno) << env->ps2ln;
return pgno << env->ps2ln;
}
MDBX_NOTHROW_PURE_FUNCTION static inline page_t *pgno2page(const MDBX_env *env, size_t pgno) {
@@ -286,7 +292,8 @@ MDBX_NOTHROW_PURE_FUNCTION static inline const page_t *payload2page(const void *
return container_of(data, page_t, entries);
}
MDBX_NOTHROW_PURE_FUNCTION static inline const page_t *ptr2page(const MDBX_env *env, const void *ptr) {
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline const page_t *ptr2page(const MDBX_env *env,
const void *ptr) {
eASSERT(env,
ptr_dist(ptr, env->dxb_mmap.base) >= 0 && (size_t)ptr_dist(ptr, env->dxb_mmap.base) < env->dxb_mmap.limit);
const uintptr_t mask = env->ps - 1;
@@ -504,9 +511,9 @@ static inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) {
}
MDBX_NOTHROW_CONST_FUNCTION static inline txnid_t txn_basis_snapshot(const MDBX_txn *txn) {
STATIC_ASSERT((MDBX_TXN_RDONLY >> 17) == 1);
STATIC_ASSERT((xMDBX_TXNID_STEP >> (xMDBX_TXNID_STEP == 2)) == 1);
const txnid_t committed_txnid = txn->txnid + (xMDBX_TXNID_STEP >> (xMDBX_TXNID_STEP == 2)) - ((txn->flags >> 17) & 1);
STATIC_ASSERT(((MDBX_TXN_RDONLY >> ((xMDBX_TXNID_STEP == 2) ? 16 : 17)) & xMDBX_TXNID_STEP) == xMDBX_TXNID_STEP);
const txnid_t committed_txnid =
txn->txnid - xMDBX_TXNID_STEP + ((txn->flags >> ((xMDBX_TXNID_STEP == 2) ? 16 : 17)) & xMDBX_TXNID_STEP);
tASSERT(txn, committed_txnid == ((txn->flags & MDBX_TXN_RDONLY) ? txn->txnid : txn->txnid - xMDBX_TXNID_STEP));
return committed_txnid;
}

View File

@@ -63,6 +63,11 @@
#cmakedefine01 MDBX_USE_MINCORE
#cmakedefine MDBX_USE_FALLOCATE_AUTO
#ifndef MDBX_USE_FALLOCATE_AUTO
#cmakedefine01 MDBX_USE_FALLOCATE
#endif /* MDBX_USE_FALLOCATE */
/* Build Info */
#ifndef MDBX_BUILD_TIMESTAMP
#cmakedefine MDBX_BUILD_TIMESTAMP "@MDBX_BUILD_TIMESTAMP@"

View File

@@ -293,7 +293,7 @@ static __always_inline int couple_init(cursor_couple_t *couple, const MDBX_txn *
}
if (unlikely(*dbi_state & DBI_STALE))
return tbl_fetch(couple->outer.txn, cursor_dbi(&couple->outer));
return tbl_refresh_absent2baddbi(couple->outer.txn, cursor_dbi(&couple->outer));
return tbl_setup_ifneed(txn->env, kvx, tree);
}

View File

@@ -236,7 +236,7 @@ enum cursor_checking {
MDBX_INTERNAL int __must_check_result cursor_validate(const MDBX_cursor *mc);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline size_t cursor_dbi(const MDBX_cursor *mc) {
cASSERT(mc, mc->txn && mc->txn->signature == txn_signature);
cASSERT(mc, mc->txn->signature == txn_signature);
size_t dbi = mc->dbi_state - mc->txn->dbi_state;
cASSERT(mc, dbi < mc->txn->env->n_dbi);
return dbi;

247
src/dbi.c
View File

@@ -5,7 +5,7 @@
#if MDBX_ENABLE_DBI_SPARSE
size_t dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi) {
tASSERT(txn, bmi > 0);
tASSERT(txn, bmi != 0);
bmi &= -bmi;
if (sizeof(txn->dbi_sparse[0]) > 4) {
static const uint8_t debruijn_ctz64[64] = {0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
@@ -33,6 +33,23 @@ struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) {
return r;
}
/* Marks the DBI-handle as olden (DBI_OLDEN | DBI_LINDO) in the given
 * transaction and then in each of its parents, walking up the chain.
 *
 * The walk stops early, right after re-marking a transaction, if the previous
 * state in that transaction had DBI_FRESH or DBI_CREAT set. Otherwise, once
 * the topmost transaction (the one without a parent) has been re-marked, its
 * dbi_seqs[dbi] is reset to zero.
 *
 * The `rc` argument is not interpreted here, it is just returned as-is,
 * so the call could be used directly in a `return` statement. */
int dbi_gone(MDBX_txn *txn, const size_t dbi, const int rc) {
  tASSERT(txn, txn->n_dbi > dbi && F_ISSET(txn->dbi_state[dbi], DBI_LINDO | DBI_VALID));

  MDBX_txn *topmost;
  do {
    topmost = txn;
    const unsigned previous = topmost->dbi_state[dbi];
    topmost->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO;
    if (previous & (DBI_FRESH | DBI_CREAT))
      return rc;
    txn = topmost->parent;
  } while (txn);

  /* TODO: FIXME */
  topmost->dbi_seqs[dbi] = 0;
  return rc;
}
__noinline int dbi_import(MDBX_txn *txn, const size_t dbi) {
const MDBX_env *const env = txn->env;
if (dbi >= env->n_dbi || !env->dbs_flags[dbi])
@@ -266,8 +283,8 @@ int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, MDBX_cmp_func
else {
if (txn->dbi_state[dbi] & DBI_STALE) {
eASSERT(env, env->dbs_flags[dbi] & DB_VALID);
int err = tbl_fetch(txn, dbi);
if (unlikely(err == MDBX_SUCCESS))
int err = tbl_refresh(txn, dbi);
if (unlikely(err != MDBX_NOTFOUND))
return err;
}
eASSERT(env, ((env->dbs_flags[dbi] ^ txn->dbs[dbi].flags) & DB_PERSISTENT_FLAGS) == 0);
@@ -325,8 +342,9 @@ static inline size_t dbi_namelen(const MDBX_val name) {
return (name.iov_len > sizeof(defer_free_item_t)) ? name.iov_len : sizeof(defer_free_item_t);
}
static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
MDBX_cmp_func *datacmp, MDBX_val name) {
static int dbi_open_locked(MDBX_txn *txn, cursor_couple_t *maindb_cx, unsigned user_flags, MDBX_cmp_func *keycmp,
MDBX_cmp_func *datacmp, MDBX_val name, const size_t fastpath_slot) {
int rc;
MDBX_env *const env = txn->env;
/* Cannot mix named table(s) with DUPSORT flags */
@@ -352,12 +370,12 @@ static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, MD
env->kvs[MAIN_DBI].clc.v.cmp = builtin_datacmp(main_flags);
txn->dbs[MAIN_DBI].flags = main_flags;
txn->dbs[MAIN_DBI].dupfix_size = 0;
int err = tbl_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]);
if (unlikely(err != MDBX_SUCCESS)) {
rc = tbl_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]);
if (unlikely(rc != MDBX_SUCCESS)) {
txn->dbi_state[MAIN_DBI] = DBI_LINDO;
txn->flags |= MDBX_TXN_ERROR;
env->flags |= ENV_FATAL_ERROR;
return err;
return rc;
}
env->dbs_flags[MAIN_DBI] = main_flags | DB_VALID;
txn->dbi_seqs[MAIN_DBI] = atomic_store32(&env->dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease);
@@ -368,6 +386,7 @@ static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, MD
tASSERT(txn, env->kvs[MAIN_DBI].clc.k.cmp);
/* Is the DB already open? */
defer_free_item_t *clone = nullptr;
size_t slot = env->n_dbi;
for (size_t scan = CORE_DBS; scan < env->n_dbi; ++scan) {
if ((env->dbs_flags[scan] & DB_VALID) == 0) {
@@ -377,21 +396,49 @@ static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, MD
}
if (env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[scan].name) == 0) {
slot = scan;
int err = dbi_check(txn, slot);
if (err == MDBX_BAD_DBI && txn->dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) {
rc = dbi_check(txn, slot);
if (rc == MDBX_BAD_DBI && txn->dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) {
/* хендл использовался, стал невалидным,
* но теперь явно пере-открывается в этой транзакци */
* но теперь явно пере-открывается в этой транзакции */
eASSERT(env, !txn->cursors[slot]);
txn->dbi_state[slot] = DBI_LINDO;
err = dbi_check(txn, slot);
txn->dbi_seqs[slot] = 0;
rc = dbi_import(txn, slot);
/* TODO: FIXME */
}
if (err == MDBX_SUCCESS) {
err = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
if (likely(err == MDBX_SUCCESS)) {
goto done;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (unlikely((txn->dbi_state[slot] & DBI_STALE) == 0))
goto done;
if (fastpath_slot /* уже был выполнен поиск посредством tbl_fetch() */) {
if (slot != fastpath_slot)
txn->dbs[slot] = txn->dbs[fastpath_slot];
if (user_flags & MDBX_CREATE) {
/* значит таблица уже была открытой, но проверка её наличия в fastpath вернула MDBX_NOTFOUND */
rc = MDBX_NOTFOUND;
} else {
/* значит в fastpath был найден пустой слот и проверка наличия таблицы завершилась успешно */
assert(rc == MDBX_SUCCESS);
}
} else {
rc = tbl_fetch(txn, &maindb_cx->outer, slot, &name, user_flags);
}
return err;
if (likely(rc == MDBX_SUCCESS))
goto done;
if (rc == MDBX_NOTFOUND && (user_flags & MDBX_CREATE)) {
name = env->kvs[scan].name;
goto create;
}
return dbi_gone(txn, slot, rc);
}
}
@@ -409,88 +456,72 @@ static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, MD
env->n_dbi = (unsigned)slot + 1;
eASSERT(env, slot < env->n_dbi);
int err = dbi_check(txn, slot);
eASSERT(env, err == MDBX_BAD_DBI);
if (unlikely(err != MDBX_BAD_DBI))
rc = dbi_check(txn, slot);
eASSERT(env, rc == MDBX_BAD_DBI);
if (unlikely(rc != MDBX_BAD_DBI))
return MDBX_PROBLEM;
/* Find the DB info */
MDBX_val body;
cursor_couple_t cx;
int rc = cursor_init(&cx.outer, txn, MAIN_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
rc = cursor_seek(&cx.outer, &name, &body, MDBX_SET).err;
rc = tbl_fetch(txn, &maindb_cx->outer, slot, &name, user_flags);
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE))
return rc;
} else {
/* make sure this is actually a table */
node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
if (unlikely((node_flags(node) & (N_DUP | N_TREE)) != N_TREE))
return MDBX_INCOMPATIBLE;
if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(tree_t))) {
ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid table node size", body.iov_len);
return MDBX_CORRUPTED;
}
memcpy(&txn->dbs[slot], body.iov_base, sizeof(tree_t));
}
/* Done here so we cannot fail after creating a new DB */
defer_free_item_t *const clone = osal_malloc(dbi_namelen(name));
clone = osal_malloc(dbi_namelen(name));
if (unlikely(!clone))
return MDBX_ENOMEM;
memcpy(clone, name.iov_base, name.iov_len);
name.iov_base = clone;
create:
tASSERT(txn, rc == MDBX_SUCCESS || rc == MDBX_NOTFOUND);
uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH;
if (unlikely(rc != MDBX_SUCCESS)) {
/* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */
tASSERT(txn, rc == MDBX_NOTFOUND);
body.iov_base = memset(&txn->dbs[slot], 0, body.iov_len = sizeof(tree_t));
txn->dbs[slot].root = P_INVALID;
txn->dbs[slot].mod_txnid = txn->txnid;
txn->dbs[slot].flags = user_flags & DB_PERSISTENT_FLAGS;
cx.outer.next = txn->cursors[MAIN_DBI];
txn->cursors[MAIN_DBI] = &cx.outer;
rc = cursor_put_checklen(&cx.outer, &name, &body, N_TREE | MDBX_NOOVERWRITE);
txn->cursors[MAIN_DBI] = cx.outer.next;
rc = tbl_create(txn, &maindb_cx->outer, slot, &name, user_flags);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
dbi_state |= DBI_DIRTY | DBI_CREAT;
txn->flags |= MDBX_TXN_DIRTY;
tASSERT(txn, (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) != 0);
}
/* Got info, register DBI in this txn */
const uint32_t seq = dbi_seq_next(env, slot);
eASSERT(env, env->dbs_flags[slot] == DB_POISON && !txn->cursors[slot] &&
(txn->dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO);
txn->dbi_state[slot] = dbi_state;
memcpy(&txn->dbs[slot], body.iov_base, sizeof(txn->dbs[slot]));
env->dbs_flags[slot] = txn->dbs[slot].flags;
rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
eASSERT(env, !txn->cursors[slot]);
if (clone) {
eASSERT(env, env->dbs_flags[slot] == DB_POISON && (txn->dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO);
txn->dbi_state[slot] = dbi_state;
env->dbs_flags[slot] = txn->dbs[slot].flags;
rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
env->kvs[slot].name = name;
env->dbs_flags[slot] = txn->dbs[slot].flags | DB_VALID;
txn->dbi_seqs[slot] = atomic_store32(&env->dbi_seqs[slot], seq, mo_AcquireRelease);
env->kvs[slot].name = name;
env->dbs_flags[slot] = txn->dbs[slot].flags | DB_VALID;
txn->dbi_seqs[slot] = atomic_store32(&env->dbi_seqs[slot], seq, mo_AcquireRelease);
} else {
eASSERT(env, env->dbs_flags[slot] == (DB_VALID | (user_flags & DB_PERSISTENT_FLAGS)) &&
env->dbs_flags[slot] == (DB_VALID | txn->dbs[slot].flags) &&
txn->dbi_state[slot] == (DBI_LINDO | DBI_VALID | DBI_STALE));
}
done:
*dbi = (MDBX_dbi)slot;
*(MDBX_dbi *)maindb_cx->userctx = (MDBX_dbi)slot;
tASSERT(txn, slot < txn->n_dbi && (env->dbs_flags[slot] & DB_VALID) != 0);
eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS);
return MDBX_SUCCESS;
bailout:
eASSERT(env, !txn->cursors[slot] && !env->kvs[slot].name.iov_len && !env->kvs[slot].name.iov_base);
txn->dbi_state[slot] &= DBI_LINDO | DBI_OLDEN;
env->dbs_flags[slot] = 0;
osal_free(clone);
if (slot + 1 == env->n_dbi)
txn->n_dbi = env->n_dbi = (unsigned)slot;
if (clone) {
eASSERT(env, !txn->cursors[slot] && !env->kvs[slot].name.iov_len && !env->kvs[slot].name.iov_base);
osal_free(clone);
if (slot + 1 == env->n_dbi)
txn->n_dbi = env->n_dbi = (unsigned)slot;
} else {
eASSERT(env, name.iov_base == env->kvs[slot].name.iov_base);
}
return rc;
}
@@ -528,18 +559,20 @@ int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags, MDB
if (unlikely(name->iov_len > txn->env->leaf_nodemax - NODESIZE - sizeof(tree_t)))
return MDBX_EINVAL;
cursor_couple_t cx;
size_t fastpath_slot = 0;
#if MDBX_ENABLE_DBI_LOCKFREE
/* Is the DB already open? */
const MDBX_env *const env = txn->env;
bool have_free_slot = env->n_dbi < env->max_dbi;
for (size_t i = CORE_DBS; i < env->n_dbi; ++i) {
if ((env->dbs_flags[i] & DB_VALID) == 0) {
have_free_slot = true;
size_t first_free_slot = env->n_dbi;
for (size_t slot = CORE_DBS; slot < env->n_dbi; ++slot) {
if ((env->dbs_flags[slot] & DB_VALID) == 0) {
first_free_slot = (first_free_slot < slot) ? first_free_slot : slot;
continue;
}
struct dbi_snap_result snap = dbi_snap(env, i);
const MDBX_val snap_name = env->kvs[i].name;
struct dbi_snap_result snap = dbi_snap(env, slot);
const MDBX_val snap_name = env->kvs[slot].name;
const uint32_t main_seq = atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease);
MDBX_cmp_func *const snap_cmp = env->kvs[MAIN_DBI].clc.k.cmp;
if (unlikely(!(snap.flags & DB_VALID) || !snap_name.iov_base || !snap_name.iov_len || !snap_cmp))
@@ -547,10 +580,10 @@ int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags, MDB
goto slowpath_locking;
const bool name_match = snap_cmp(&snap_name, name) == 0;
if (unlikely(snap.sequence != atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease) ||
if (unlikely(snap.sequence != atomic_load32(&env->dbi_seqs[slot], mo_AcquireRelease) ||
main_seq != atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease) ||
snap.flags != env->dbs_flags[i] || snap_name.iov_base != env->kvs[i].name.iov_base ||
snap_name.iov_len != env->kvs[i].name.iov_len))
snap.flags != env->dbs_flags[slot] || snap_name.iov_base != env->kvs[slot].name.iov_base ||
snap_name.iov_len != env->kvs[slot].name.iov_len))
/* похоже на столкновение с параллельно работающим обновлением */
goto slowpath_locking;
@@ -559,45 +592,65 @@ int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags, MDB
osal_flush_incoherent_cpu_writeback();
if (user_flags != MDBX_ACCEDE &&
(((user_flags ^ snap.flags) & DB_PERSISTENT_FLAGS) || (keycmp && keycmp != env->kvs[i].clc.k.cmp) ||
(datacmp && datacmp != env->kvs[i].clc.v.cmp)))
(((user_flags ^ snap.flags) & DB_PERSISTENT_FLAGS) || (keycmp && keycmp != env->kvs[slot].clc.k.cmp) ||
(datacmp && datacmp != env->kvs[slot].clc.v.cmp)))
/* есть подозрение что пользователь открывает таблицу с другими флагами/атрибутами
* или другими компараторами, поэтому уходим в безопасный режим */
goto slowpath_locking;
rc = dbi_check(txn, i);
if (rc == MDBX_BAD_DBI && txn->dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) {
rc = dbi_check(txn, slot);
if (rc == MDBX_BAD_DBI && txn->dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) {
/* хендл использовался, стал невалидным,
* но теперь явно пере-открывается в этой транзакци */
eASSERT(env, !txn->cursors[i]);
txn->dbi_state[i] = DBI_LINDO;
rc = dbi_check(txn, i);
* но теперь явно пере-открывается в этой транзакции */
goto slowpath_locking;
}
if (likely(rc == MDBX_SUCCESS)) {
if (unlikely(snap.sequence != atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease) ||
main_seq != atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease) ||
snap.flags != env->dbs_flags[i] || snap_name.iov_base != env->kvs[i].name.iov_base ||
snap_name.iov_len != env->kvs[i].name.iov_len))
/* похоже на столкновение с параллельно работающим обновлением */
goto slowpath_locking;
rc = dbi_bind(txn, i, user_flags, keycmp, datacmp);
if (likely(rc == MDBX_SUCCESS))
*dbi = (MDBX_dbi)i;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (unlikely(snap.sequence != atomic_load32(&env->dbi_seqs[slot], mo_AcquireRelease) ||
main_seq != atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease) ||
snap.flags != env->dbs_flags[slot] || snap_name.iov_base != env->kvs[slot].name.iov_base ||
snap_name.iov_len != env->kvs[slot].name.iov_len))
/* похоже на столкновение с параллельно работающим обновлением */
goto slowpath_locking;
rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
tASSERT(txn, F_ISSET(txn->dbi_state[slot], DBI_LINDO | DBI_VALID));
if (txn->dbi_state[slot] & DBI_STALE) {
rc = tbl_fetch(txn, &cx.outer, fastpath_slot = slot, name, user_flags);
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_NOTFOUND && (user_flags & MDBX_CREATE))
/* таблицы уже нет, но запрошено её пересоздание */
goto slowpath_locking;
return dbi_gone(txn, slot, rc);
}
txn->dbi_state[slot] -= DBI_STALE;
}
return rc;
*dbi = (MDBX_dbi)slot;
return MDBX_SUCCESS;
}
/* Fail, if no free slot and max hit */
if (unlikely(!have_free_slot))
if (unlikely(first_free_slot >= env->max_dbi))
return MDBX_DBS_FULL;
slowpath_locking:
if (!(user_flags & MDBX_CREATE)) {
rc = tbl_fetch(txn, &cx.outer, fastpath_slot = first_free_slot, name, user_flags);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
slowpath_locking:
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
cx.userctx = dbi;
rc = osal_fastmutex_acquire(&txn->env->dbi_lock);
if (likely(rc == MDBX_SUCCESS)) {
rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name);
rc = dbi_open_locked(txn, &cx, user_flags, keycmp, datacmp, *name, fastpath_slot);
ENSURE(txn->env, osal_fastmutex_release(&txn->env->dbi_lock) == MDBX_SUCCESS);
}
return rc;

View File

@@ -11,7 +11,7 @@ MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL size_t dbi_bitmap_ct
intptr_t bmi);
static inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) {
tASSERT(txn, bmi > 0);
tASSERT(txn, bmi != 0);
STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->dbi_sparse[0]));
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
if (sizeof(txn->dbi_sparse[0]) <= sizeof(int))
@@ -80,6 +80,7 @@ static inline bool dbi_foreach_step(const MDBX_txn *const txn, size_t *bitmap_it
#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS)
MDBX_INTERNAL int dbi_import(MDBX_txn *txn, const size_t dbi);
MDBX_INTERNAL int dbi_gone(MDBX_txn *txn, const size_t dbi, const int rc);
struct dbi_snap_result {
uint32_t sequence;

View File

@@ -532,7 +532,7 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit
if (unlikely(err != MDBX_SUCCESS))
return err;
err = osal_fallocate(env->lazy_fd, env->dxb_mmap.filesize = env->dxb_mmap.current = env->geo_in_bytes.now);
err = osal_fsetsize(env->lazy_fd, env->dxb_mmap.filesize = env->dxb_mmap.current = env->geo_in_bytes.now);
if (unlikely(err != MDBX_SUCCESS))
return err;
@@ -570,7 +570,7 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit
size_t expected_filesize = 0;
const size_t used_bytes = pgno2bytes(env, header.geometry.first_unallocated);
const size_t used_aligned2os_bytes = ceil_powerof2(used_bytes, globals.sys_pagesize);
const size_t used_aligned2os_bytes = ceil_powerof2(used_bytes, globals.sys_allocation_granularity);
if ((env->flags & MDBX_RDONLY) /* readonly */
|| lck_rc != MDBX_RESULT_TRUE /* not exclusive */
|| /* recovery mode */ env->stuck_meta >= 0) {
@@ -639,7 +639,7 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit
env->geo_in_bytes.shrink = pgno_ceil2sp_bytes(env, pv2pages(header.geometry.shrink_pv));
}
ENSURE(env, pgno_ceil2sp_bytes(env, header.geometry.now) == env->geo_in_bytes.now);
ENSURE(env, pgno_ceil2ag_bytes(env, header.geometry.now) == env->geo_in_bytes.now);
ENSURE(env, env->geo_in_bytes.now >= used_bytes);
if (!expected_filesize)
expected_filesize = env->geo_in_bytes.now;
@@ -843,13 +843,13 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit
if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) {
//-------------------------------------------------- shrink DB & update geo
/* re-check size after mmap */
if ((env->dxb_mmap.current & (globals.sys_pagesize - 1)) != 0 || env->dxb_mmap.current < used_bytes) {
if (floor_powerof2(env->dxb_mmap.current, globals.sys_pagesize) < used_bytes) {
ERROR("unacceptable/unexpected datafile size %" PRIuPTR, env->dxb_mmap.current);
return MDBX_PROBLEM;
}
if (env->dxb_mmap.current != env->geo_in_bytes.now) {
header.geometry.now = bytes2pgno(env, env->dxb_mmap.current);
NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO " pages", env->dxb_mmap.current,
header.geometry.now = bytes2pgno(env, env->geo_in_bytes.now);
NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, aligned %" PRIaPGNO " pages", env->geo_in_bytes.now,
header.geometry.now);
}

View File

@@ -305,6 +305,12 @@ __cold int env_open(MDBX_env *env, mdbx_mode_t mode) {
env->fd4meta = env->lazy_fd;
#if defined(_WIN32) || defined(_WIN64)
env->dxb_lock_event = CreateEventW(nullptr, true, false, nullptr);
if (unlikely(!env->dxb_lock_event))
return (int)GetLastError();
env->lck_lock_event = CreateEventW(nullptr, true, false, nullptr);
if (unlikely(!env->lck_lock_event))
return (int)GetLastError();
eASSERT(env, env->ioring.overlapped_fd == 0);
bool ior_direct = false;
if (!(env->flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) {
@@ -346,9 +352,6 @@ __cold int env_open(MDBX_env *env, mdbx_mode_t mode) {
&env->ioring.overlapped_fd, 0);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
env->dxb_lock_event = CreateEventW(nullptr, true, false, nullptr);
if (unlikely(!env->dxb_lock_event))
return (int)GetLastError();
osal_fseek(env->ioring.overlapped_fd, safe_parking_lot_offset);
}
#else
@@ -545,6 +548,10 @@ __cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
CloseHandle(env->dxb_lock_event);
env->dxb_lock_event = INVALID_HANDLE_VALUE;
}
if (env->lck_lock_event != INVALID_HANDLE_VALUE) {
CloseHandle(env->lck_lock_event);
env->lck_lock_event = INVALID_HANDLE_VALUE;
}
eASSERT(env, !resurrect_after_fork);
if (env->pathname_char) {
osal_free(env->pathname_char);

View File

@@ -65,6 +65,9 @@ union logger_union {
struct libmdbx_globals {
bin128_t bootid;
unsigned sys_pagesize, sys_allocation_granularity;
#ifdef AT_UCACHEBSIZE
unsigned sys_unified_cache_block;
#endif /* AT_UCACHEBSIZE */
uint8_t sys_pagesize_ln2;
uint8_t runtime_flags;
uint8_t loglevel;

View File

@@ -878,7 +878,7 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
//---------------------------------------------------------------------------
if (unlikely(!is_reclaimable(txn, mc, flags))) {
eASSERT(env, (txn->flags & txn_gc_drained) || num > 1);
eASSERT(env, (txn->flags & txn_gc_drained) || num > 1 || mc->tree == &txn->dbs[FREE_DBI]);
goto no_gc;
}
@@ -1090,7 +1090,7 @@ next_gc:
rkl_t *rkl = &txn->wr.gc.reclaimed;
const char *rkl_name = "reclaimed";
if (mc->dbi_state != txn->dbi_state &&
(MDBX_DEBUG || pnl_size(txn->wr.repnl) > (size_t)gc->tree->height + gc->tree->height + 3)) {
(MDBX_DEBUG > 0 || pnl_size(txn->wr.repnl) > (size_t)gc->tree->height + gc->tree->height + 3)) {
gc->next = txn->cursors[FREE_DBI];
txn->cursors[FREE_DBI] = gc;
ret.err = cursor_del(gc, 0);

View File

@@ -1290,9 +1290,12 @@ static int gc_fill_returned(MDBX_txn *txn, gcu_t *ctx) {
rkl_iter_t iter = rkl_iterator(&txn->wr.gc.comeback, is_lifo(txn));
size_t surplus = ctx->return_reserved_hi - amount, stored = 0;
const uint64_t factor = ((uint64_t)surplus << 32) / ctx->return_reserved_hi;
TRACE("%s: amount %zu, slots %zu, surplus %zu (%zu..%zu), factor %.6f (%" PRIu64 " >> 32, sharp %.12f)",
dbg_prefix(ctx), amount, slots, surplus, ctx->return_reserved_lo, ctx->return_reserved_hi,
factor / (double)UINT32_MAX, factor, surplus / (double)ctx->return_reserved_hi);
ratio2digits_buffer_t factor_rough, factor_sharp;
TRACE("%s: amount %zu, slots %zu, surplus %zu (%zu..%zu), factor %s (%" PRIu64 " >> 32, sharp %s)", dbg_prefix(ctx),
amount, slots, surplus, ctx->return_reserved_lo, ctx->return_reserved_hi,
ratio2digits(factor, UINT32_MAX, &factor_rough, 6), factor,
ratio2digits(surplus, ctx->return_reserved_hi, &factor_sharp, 12));
do {
const size_t left = amount - stored;
tASSERT(txn, left > 0 && left <= amount);

View File

@@ -27,8 +27,7 @@ BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
#if !MDBX_MANUAL_MODULE_HANDLER
static
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
void NTAPI
mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
void NTAPI mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
#endif /* MDBX_BUILD_SHARED_LIBRARY */
{
(void)reserved;
@@ -176,8 +175,29 @@ __cold static __attribute__((__destructor__)) void mdbx_global_destructor(void)
struct libmdbx_globals globals;
/* Reads the environment variable `name` via osal_getenv() and interprets it
 * as an on/off switch.
 *
 * Returns true when the variable is set to an empty string or (ignoring case)
 * to one of "yes", "on", "true", "1"; returns false for "no", "off", "false",
 * "0"; in every other case (variable absent or unrecognized text) returns
 * `default_value`. */
static bool getenv_bool(const char *name, bool default_value) {
  static const char *const enabling[] = {"yes", "on", "true", "1"};
  static const char *const disabling[] = {"no", "off", "false", "0"};

  const char *const text = osal_getenv(name, false);
  if (!text)
    return default_value;

  /* a variable that is present but empty is treated as "implied ON" */
  if (text[0] == 0)
    return true;

  for (size_t i = 0; i < sizeof(enabling) / sizeof(enabling[0]); ++i)
    if (strcasecmp(text, enabling[i]) == 0)
      return true;

  for (size_t i = 0; i < sizeof(disabling) / sizeof(disabling[0]); ++i)
    if (strcasecmp(text, disabling[i]) == 0)
      return false;

  /* unrecognized text: keep the caller-provided default */
  return default_value;
}
__cold static void mdbx_init(void) {
globals.runtime_flags = ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT;
globals.runtime_flags = (getenv_bool("MDBX_DBG_ASSERT", (MDBX_DEBUG) > 0) ? MDBX_DBG_ASSERT : 0) |
(getenv_bool("MDBX_DBG_AUDIT", (MDBX_DEBUG) > 1) ? MDBX_DBG_AUDIT : 0) |
(getenv_bool("MDBX_DBG_JITTER", false) ? MDBX_DBG_JITTER : 0) |
(getenv_bool("MDBX_DBG_DUMP", false) ? MDBX_DBG_DUMP : 0) |
(getenv_bool("MDBX_DBG_LEGACY_MULTIOPEN", false) ? MDBX_DBG_LEGACY_MULTIOPEN : 0) |
(getenv_bool("MDBX_DBG_LEGACY_OVERLAP", false) ? MDBX_DBG_LEGACY_OVERLAP : 0) |
(getenv_bool("MDBX_DBG_DONT_UPGRADE", false) ? MDBX_DBG_DONT_UPGRADE : 0);
globals.loglevel = MDBX_LOG_FATAL;
ENSURE(nullptr, osal_fastmutex_init(&globals.debug_lock) == 0);
osal_ctor();
@@ -379,6 +399,7 @@ __dll_export
#else /* Windows */
" MDBX_LOCKING=" MDBX_LOCKING_CONFIG
" MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG
" MDBX_USE_FALLOCATE=" MDBX_USE_FALLOCATE_CONFIG
#endif /* !Windows */
" MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE)
" MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT)

View File

@@ -142,10 +142,10 @@ struct kvx {
/* Non-shared DBI state flags inside transaction */
enum dbi_state {
DBI_DIRTY = 0x01 /* DB was written in this txn */,
DBI_STALE = 0x02 /* Named-DB record is older than txnID */,
DBI_FRESH = 0x04 /* Named-DB handle opened in this txn */,
DBI_CREAT = 0x08 /* Named-DB handle created in this txn */,
DBI_DIRTY = 0x01 /* table was written in this txn */,
DBI_STALE = 0x02 /* cached table record is outdated and should be reloaded/refreshed */,
DBI_FRESH = 0x04 /* table handle opened in this txn */,
DBI_CREAT = 0x08 /* table handle created in this txn */,
DBI_VALID = 0x10 /* Handle is valid, see also DB_VALID */,
DBI_OLDEN = 0x40 /* Handle was closed/reopened outside txn */,
DBI_LINDO = 0x80 /* Lazy initialization done for DBI-slot */,
@@ -352,6 +352,7 @@ struct MDBX_env {
mdbx_filehandle_t dsync_fd, fd4meta;
#if defined(_WIN32) || defined(_WIN64)
HANDLE dxb_lock_event;
HANDLE lck_lock_event;
#endif /* Windows */
osal_mmap_t lck_mmap; /* The lock file */
lck_t *lck;
@@ -481,7 +482,8 @@ struct MDBX_env {
#if defined(_WIN32) || defined(_WIN64)
osal_srwlock_t remap_guard;
/* Workaround for LockFileEx and WriteFile multithread bug */
CRITICAL_SECTION windowsbug_lock;
CRITICAL_SECTION lck_event_cs;
CRITICAL_SECTION dxb_event_cs;
char *pathname_char; /* cache of multi-byte representation of pathname
to the DB files */
#else

View File

@@ -16,8 +16,11 @@
#define LCK_WAITFOR 0
#define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY
static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags, size_t offset, size_t bytes) {
TRACE("lock>>: fd %p, event %p, flags 0x%x offset %zu, bytes %zu >>", fd, event, flags, offset, bytes);
static int flock_ex(HANDLE fd, HANDLE event, unsigned flags, size_t offset, size_t bytes, unsigned timeout_ms) {
TRACE("lock>>: fd %p, timeout %u ms, event %p, flags 0x%x offset %zu, bytes %zu >>", fd, timeout_ms, event, flags,
offset, bytes);
assert(timeout_ms == 0 || (event && event != INVALID_HANDLE_VALUE));
assert(timeout_ms == 0 || (flags & LCK_DONTWAIT) == 0);
OVERLAPPED ov;
ov.Internal = 0;
ov.InternalHigh = 0;
@@ -25,43 +28,43 @@ static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags, size_t offs
ov.Offset = (DWORD)offset;
ov.OffsetHigh = HIGH_DWORD(offset);
int retry_left = (flags & LOCKFILE_FAIL_IMMEDIATELY) ? 3 : 0;
while (true) {
if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) {
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, event, flags, offset, bytes, "done");
if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) {
TRACE("lock<<: fd %p, timeout %u ms, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, timeout_ms, event,
flags, offset, bytes, "done");
return MDBX_SUCCESS;
}
DWORD rc = GetLastError();
if (rc == ERROR_IO_PENDING) {
if (timeout_ms) {
rc = osal_waitstatus2errcode(WaitForSingleObject(event, timeout_ms));
if (rc != MDBX_SUCCESS) {
if (rc == ERROR_TIMEOUT)
rc = ERROR_LOCK_VIOLATION;
goto bailout;
}
}
if (GetOverlappedResult(fd, &ov, &rc, true)) {
TRACE("lock<<: fd %p, timeout %u ms, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, timeout_ms, event,
flags, offset, bytes, "overlapped-done");
return MDBX_SUCCESS;
}
DWORD rc = GetLastError();
if (rc == ERROR_IO_PENDING) {
if (event) {
if (GetOverlappedResult(fd, &ov, &rc, true)) {
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, event, flags, offset, bytes,
"overlapped-done");
return MDBX_SUCCESS;
}
rc = GetLastError();
} else
CancelIo(fd);
}
if (rc != ERROR_LOCK_VIOLATION || --retry_left < 1) {
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << err %d", fd, event, flags, offset, bytes,
(int)rc);
return (int)rc;
}
SleepEx(0, true);
bailout:
CancelIo(fd);
}
TRACE("lock<<: fd %p, timeout %u ms, event %p, flags 0x%x offset %zu, bytes %zu << err %d", fd, timeout_ms, event,
flags, offset, bytes, (int)rc);
return (int)rc;
}
static inline int flock(HANDLE fd, unsigned flags, size_t offset, size_t bytes) {
return flock_with_event(fd, 0, flags, offset, bytes);
static int flock_lck(const MDBX_env *env, unsigned flags, size_t offset, size_t bytes, unsigned timeout_ms) {
return flock_ex(env->lck_mmap.fd, env->lck_lock_event, flags, offset, bytes, timeout_ms);
}
static inline int flock_data(const MDBX_env *env, unsigned flags, size_t offset, size_t bytes) {
static int flock_dxb(const MDBX_env *env, unsigned flags, size_t offset, size_t bytes) {
const HANDLE fd4data = env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
return flock_with_event(fd4data, env->dxb_lock_event, flags, offset, bytes);
return flock_ex(fd4data, env->dxb_lock_event, flags, offset, bytes, 0);
}
static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) {
@@ -84,11 +87,11 @@ static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) {
int lck_txn_lock(MDBX_env *env, bool dontwait) {
if (dontwait) {
if (!TryEnterCriticalSection(&env->windowsbug_lock))
if (!TryEnterCriticalSection(&env->dxb_event_cs))
return MDBX_BUSY;
} else {
__try {
EnterCriticalSection(&env->windowsbug_lock);
EnterCriticalSection(&env->dxb_event_cs);
} __except ((GetExceptionCode() == 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
? EXCEPTION_EXECUTE_HANDLER
: EXCEPTION_CONTINUE_SEARCH) {
@@ -100,34 +103,32 @@ int lck_txn_lock(MDBX_env *env, bool dontwait) {
if (env->flags & MDBX_EXCLUSIVE)
goto done;
const HANDLE fd4data = env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
int rc = flock_with_event(fd4data, env->dxb_lock_event,
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) : (LCK_EXCLUSIVE | LCK_WAITFOR), DXB_BODY);
int rc = flock_dxb(env, dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) : (LCK_EXCLUSIVE | LCK_WAITFOR), DXB_BODY);
if (rc == MDBX_SUCCESS) {
done:
if (env->basal_txn)
env->basal_txn->owner = osal_thread_self();
/* Zap: Failing to release lock 'env->windowsbug_lock'
/* Zap: Failing to release lock 'env->dxb_event_cs'
* in function 'mdbx_txn_lock' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115);
return MDBX_SUCCESS;
}
LeaveCriticalSection(&env->windowsbug_lock);
LeaveCriticalSection(&env->dxb_event_cs);
return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
}
void lck_txn_unlock(MDBX_env *env) {
eASSERT(env, !env->basal_txn || env->basal_txn->owner == osal_thread_self());
if ((env->flags & MDBX_EXCLUSIVE) == 0) {
const HANDLE fd4data = env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
int err = funlock(fd4data, DXB_BODY);
int err = funlock(env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd, DXB_BODY);
if (err != MDBX_SUCCESS)
mdbx_panic("%s failed: err %u", __func__, err);
}
if (env->basal_txn)
env->basal_txn->owner = 0;
LeaveCriticalSection(&env->windowsbug_lock);
LeaveCriticalSection(&env->dxb_event_cs);
}
/*----------------------------------------------------------------------------*/
@@ -142,19 +143,34 @@ void lck_txn_unlock(MDBX_env *env) {
#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
int lck_rdt_lock(MDBX_env *env) {
int rc;
imports.srwl_AcquireShared(&env->remap_guard);
__try {
EnterCriticalSection(&env->lck_event_cs);
} __except ((GetExceptionCode() == 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
? EXCEPTION_EXECUTE_HANDLER
: EXCEPTION_CONTINUE_SEARCH) {
rc = MDBX_EDEADLK;
goto bailout;
}
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE)
return MDBX_SUCCESS; /* readonly database in readonly filesystem */
goto done; /* readonly database in readonly filesystem */
/* transition from S-? (used) to S-E (locked),
* e.g. exclusive lock upper-part */
if (env->flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS;
goto done;
int rc = flock(env->lck_mmap.fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
if (rc == MDBX_SUCCESS)
rc = flock_lck(env, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER, 0);
if (rc == MDBX_SUCCESS) {
done:
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115);
return MDBX_SUCCESS;
}
bailout:
imports.srwl_ReleaseShared(&env->remap_guard);
return rc;
}
@@ -166,11 +182,12 @@ void lck_rdt_unlock(MDBX_env *env) {
if (err != MDBX_SUCCESS)
mdbx_panic("%s failed: err %u", __func__, err);
}
LeaveCriticalSection(&env->lck_event_cs);
imports.srwl_ReleaseShared(&env->remap_guard);
}
int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
return flock(fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0, DXB_MAXLEN);
return flock_ex(fd, 0, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0, DXB_MAXLEN, 0);
}
static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) {
@@ -359,15 +376,18 @@ static void lck_unlock(MDBX_env *env) {
}
}
/* Wait limits, in milliseconds, passed as the `timeout_ms` argument of flock_lck()
 * when taking LCK-file range locks. flock_ex() turns a wait that times out into
 * ERROR_LOCK_VIOLATION. */
/* Short limit: used for the attempt to get E-E (exclusive-write) in internal_seize_lck(). */
#define TIMEOUT_SHORT_MS 121
/* Long limit: used for the remaining timed LCK-file lock requests in
 * internal_seize_lck() and lck_downgrade(). */
#define TIMEOUT_LONG_MS 900000 /* 15 min */
/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE)
* or as 'used' (S-? and returns MDBX_RESULT_FALSE).
* Otherwise returns an error. */
static int internal_seize_lck(HANDLE lfd) {
assert(lfd != INVALID_HANDLE_VALUE);
static int internal_seize_lck(MDBX_env *env) {
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
/* 1) now on ?-? (free), get ?-E (middle) */
jitter4testing(false);
int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
int rc = flock_lck(env, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER, TIMEOUT_LONG_MS);
if (rc != MDBX_SUCCESS) {
/* 2) something went wrong, give up */;
ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc);
@@ -376,7 +396,7 @@ static int internal_seize_lck(HANDLE lfd) {
/* 3) now on ?-E (middle), try E-E (exclusive-write) */
jitter4testing(false);
rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER);
rc = flock_lck(env, LCK_EXCLUSIVE, LCK_LOWER, TIMEOUT_SHORT_MS);
if (rc == MDBX_SUCCESS)
return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */;
@@ -384,7 +404,7 @@ static int internal_seize_lck(HANDLE lfd) {
jitter4testing(false);
if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) {
/* 6) something went wrong, give up */
rc = funlock(lfd, LCK_UPPER);
rc = funlock(env->lck_mmap.fd, LCK_UPPER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", rc);
return rc;
@@ -392,7 +412,7 @@ static int internal_seize_lck(HANDLE lfd) {
/* 7) still on ?-E (middle), try S-E (locked) */
jitter4testing(false);
rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
rc = flock_lck(env, LCK_SHARED, LCK_LOWER, TIMEOUT_LONG_MS);
jitter4testing(false);
if (rc != MDBX_SUCCESS)
@@ -400,7 +420,7 @@ static int internal_seize_lck(HANDLE lfd) {
/* 8) now on S-E (locked) or still on ?-E (middle),
* transition to S-? (used) or ?-? (free) */
int err = funlock(lfd, LCK_UPPER);
int err = funlock(env->lck_mmap.fd, LCK_UPPER);
if (err != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "X-E(locked/middle) >> X-?(used/free)", err);
@@ -409,8 +429,6 @@ static int internal_seize_lck(HANDLE lfd) {
}
int lck_seize(MDBX_env *env) {
const HANDLE fd4data = env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
assert(fd4data != INVALID_HANDLE_VALUE);
if (env->flags & MDBX_EXCLUSIVE)
return MDBX_RESULT_TRUE /* nope since files were must be opened
non-shareable */
@@ -419,13 +437,13 @@ int lck_seize(MDBX_env *env) {
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. on read-only filesystem) */
jitter4testing(false);
int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE);
int rc = flock_dxb(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE);
if (rc != MDBX_SUCCESS)
ERROR("%s, err %u", "without-lck", rc);
return rc;
}
int rc = internal_seize_lck(env->lck_mmap.fd);
int rc = internal_seize_lck(env);
jitter4testing(false);
if (rc == MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) {
/* Check that another process don't operates in without-lck mode.
@@ -434,7 +452,7 @@ int lck_seize(MDBX_env *env) {
* - we need an exclusive lock for do so;
* - we can't lock meta-pages, otherwise other process could get an error
* while opening db in valid (non-conflict) mode. */
int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE);
int err = flock_dxb(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE);
if (err != MDBX_SUCCESS) {
ERROR("%s, err %u", "lock-against-without-lck", err);
jitter4testing(false);
@@ -442,7 +460,7 @@ int lck_seize(MDBX_env *env) {
return err;
}
jitter4testing(false);
err = funlock(fd4data, DXB_WHOLE);
err = funlock(env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd, DXB_WHOLE);
if (err != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "unlock-against-without-lck", err);
}
@@ -451,9 +469,7 @@ int lck_seize(MDBX_env *env) {
}
int lck_downgrade(MDBX_env *env) {
const HANDLE fd4data = env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
/* Transite from exclusive-write state (E-E) to used (S-?) */
assert(fd4data != INVALID_HANDLE_VALUE);
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
if (env->flags & MDBX_EXCLUSIVE)
@@ -465,7 +481,7 @@ int lck_downgrade(MDBX_env *env) {
mdbx_panic("%s(%s) failed: err %u", __func__, "E-E(exclusive-write) >> ?-E(middle)", rc);
/* 2) now at ?-E (middle), transition to S-E (locked) */
rc = flock(env->lck_mmap.fd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
rc = flock_lck(env, LCK_SHARED, LCK_LOWER, TIMEOUT_LONG_MS);
if (rc != MDBX_SUCCESS) {
/* 3) something went wrong, give up */;
ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
@@ -490,7 +506,7 @@ int lck_upgrade(MDBX_env *env, bool dont_wait) {
/* 1) now on S-? (used), try S-E (locked) */
jitter4testing(false);
int rc = flock(env->lck_mmap.fd, dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, LCK_UPPER);
int rc = flock_lck(env, dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, LCK_UPPER, 0);
if (rc != MDBX_SUCCESS) {
/* 2) something went wrong, give up */;
VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc);
@@ -504,7 +520,7 @@ int lck_upgrade(MDBX_env *env, bool dont_wait) {
/* 4) now on ?-E (middle), try E-E (exclusive-write) */
jitter4testing(false);
rc = flock(env->lck_mmap.fd, dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, LCK_LOWER);
rc = flock_lck(env, dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, LCK_LOWER, 0);
if (rc != MDBX_SUCCESS) {
/* 5) something went wrong, give up */;
VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc);
@@ -547,7 +563,7 @@ int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor, const uint32_t curr
if (synced && !inprocess_neighbor && env->lck_mmap.fd != INVALID_HANDLE_VALUE &&
lck_upgrade(env, true) == MDBX_SUCCESS)
/* this will fail if LCK is used/mmapped by other process(es) */
osal_ftruncate(env->lck_mmap.fd, 0);
osal_fsetsize(env->lck_mmap.fd, 0);
}
lck_unlock(env);
return MDBX_SUCCESS;

View File

@@ -86,7 +86,10 @@ MDBX_INTERNAL void debug_log_va(int level, const char *function, int line, const
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, unsigned line);
#if !((defined(_WIN32) || defined(_WIN64)) && defined(_DEBUG) && !MDBX_WITHOUT_MSVC_CRT)
MDBX_NORETURN
#endif
__cold void assert_fail(const char *msg, const char *func, unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \

View File

@@ -189,7 +189,7 @@
/** Avoid dependence from MSVC CRT and use ntdll.dll instead. */
#ifndef MDBX_WITHOUT_MSVC_CRT
#if defined(MDBX_BUILD_CXX) && !MDBX_BUILD_CXX
#if defined(MDBX_BUILD_CXX) && !MDBX_BUILD_CXX && (defined(_WIN32) || defined(_WIN64))
#define MDBX_WITHOUT_MSVC_CRT 1
#else
#define MDBX_WITHOUT_MSVC_CRT 0
@@ -303,7 +303,8 @@
((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) && _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) || \
(defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) && _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) || \
defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) && \
(!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */)
(!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */) && \
!defined(__OHOS__) /* Harmony OS doesn't support robust mutexes at the end of 2025 */
#define MDBX_LOCKING MDBX_LOCKING_POSIX2008
#else
#define MDBX_LOCKING MDBX_LOCKING_POSIX2001
@@ -358,6 +359,22 @@
#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1
#endif /* MDBX_USE_COPYFILERANGE */
/** Advanced: Using posix_fallocate() or fcntl(F_PREALLOCATE) on OSX (autodetection by default). */
#ifndef MDBX_USE_FALLOCATE
#if defined(__APPLE__)
#define MDBX_USE_FALLOCATE 0 /* Too slow and unclean, but not required to prevent SIGBUS */
#elif (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) || (__GLIBC_PREREQ(2, 10) && defined(_GNU_SOURCE))
#define MDBX_USE_FALLOCATE 1
#else
#define MDBX_USE_FALLOCATE 0
#endif
#define MDBX_USE_FALLOCATE_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_FALLOCATE)
#elif !(MDBX_USE_FALLOCATE == 0 || MDBX_USE_FALLOCATE == 1)
#error MDBX_USE_FALLOCATE must be defined as 0 or 1
#else
#define MDBX_USE_FALLOCATE_CONFIG MDBX_STRINGIFY(MDBX_USE_FALLOCATE)
#endif /* MDBX_USE_FALLOCATE */
//------------------------------------------------------------------------------
#ifndef MDBX_CPU_WRITEBACK_INCOHERENT

View File

@@ -14,23 +14,6 @@
#include <crtdbg.h>
#endif
/* Translates the status returned by a Win32 wait call into an error code:
 * MDBX_SUCCESS when the object was signaled, the calling thread's last-error
 * value for WAIT_FAILED, and a fixed ERROR_xxx code for every other status. */
static int waitstatus2errcode(DWORD result) {
  if (result == WAIT_OBJECT_0)
    return MDBX_SUCCESS;
  if (result == WAIT_FAILED)
    return (int)GetLastError();
  if (result == WAIT_ABANDONED)
    return ERROR_ABANDONED_WAIT_0;
  if (result == WAIT_IO_COMPLETION)
    return ERROR_USER_APC;
  if (result == WAIT_TIMEOUT)
    return ERROR_TIMEOUT;
  /* any other status is not expected here */
  return ERROR_UNHANDLED_ERROR;
}
/* Map a result from an NTAPI call to WIN32 error code. */
static int ntstatus2errcode(NTSTATUS status) {
DWORD dummy;
@@ -205,7 +188,7 @@ __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *f
assert_fail(msg, func, line);
}
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, unsigned line) {
__cold void assert_fail(const char *msg, const char *func, unsigned line) {
#endif /* MDBX_DEBUG */
if (globals.logger.ptr)
@@ -224,13 +207,19 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, unsigne
while (1) {
#if defined(_WIN32) || defined(_WIN64)
#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG)
_CrtDbgReport(_CRT_ASSERT, func ? func : "unknown", line, "libmdbx", "assertion failed: %s", msg);
#if defined(_DEBUG) && !MDBX_WITHOUT_MSVC_CRT
if (_CrtDbgReport(_CRT_ASSERT, func ? func : "unknown", line, "libmdbx", "assertion failed: %s", msg) == 0)
return /* user chooses the "Continue" button */;
else {
/* user chooses the "Retry" button */
if (IsDebuggerPresent())
DebugBreak();
}
#else
if (IsDebuggerPresent())
DebugBreak();
#endif
FatalExit(STATUS_ASSERTION_FAILURE);
#endif
#else
abort();
#endif
@@ -252,14 +241,14 @@ __cold void mdbx_panic(const char *fmt, ...) {
while (1) {
#if defined(_WIN32) || defined(_WIN64)
#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG)
#if defined(_DEBUG) && !MDBX_WITHOUT_MSVC_CRT
_CrtDbgReport(_CRT_ASSERT, "mdbx.c", 0, "libmdbx", "panic: %s", const_message);
#else
OutputDebugStringA("\r\nMDBX-PANIC: ");
OutputDebugStringA(const_message);
#endif
if (IsDebuggerPresent())
DebugBreak();
#endif
FatalExit(ERROR_UNHANDLED_ERROR);
#else
__assert_fail(const_message, "mdbx-panic", 0, const_message);
@@ -414,7 +403,7 @@ int osal_condpair_destroy(osal_condpair_t *condpair) {
int osal_condpair_lock(osal_condpair_t *condpair) {
#if defined(_WIN32) || defined(_WIN64)
DWORD code = WaitForSingleObject(condpair->mutex, INFINITE);
return waitstatus2errcode(code);
return osal_waitstatus2errcode(code);
#else
return osal_pthread_mutex_lock(&condpair->mutex);
#endif
@@ -444,7 +433,7 @@ int osal_condpair_wait(osal_condpair_t *condpair, bool part) {
if (code == WAIT_OBJECT_0)
return MDBX_SUCCESS;
}
return waitstatus2errcode(code);
return osal_waitstatus2errcode(code);
#else
return pthread_cond_wait(&condpair->cond[part], &condpair->mutex);
#endif
@@ -1186,7 +1175,7 @@ int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env,
case MDBX_OPEN_LCK:
CreationDisposition = OPEN_ALWAYS;
DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY;
FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_OVERLAPPED;
break;
case MDBX_OPEN_DXB_READ:
CreationDisposition = OPEN_EXISTING;
@@ -1594,8 +1583,7 @@ int osal_is_pipe(mdbx_filehandle_t fd) {
#endif
}
/* truncate file: just set the length of a file */
int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
int osal_fsetsize(mdbx_filehandle_t fd, const uint64_t length) {
#if defined(_WIN32) || defined(_WIN64)
if (imports.SetFileInformationByHandle) {
FILE_END_OF_FILE_INFO EndOfFileInfo;
@@ -1610,31 +1598,58 @@ int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
}
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems");
return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno;
#endif
}
/* extend file: set the length of a file AND ensure the space has been allocated */
int osal_fallocate(mdbx_filehandle_t fd, uint64_t length) {
assert(length > 0);
int err = MDBX_RESULT_TRUE;
#if (defined(__linux__) || defined(__gnu_linux__)) && \
((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 10)) || (defined(__ANDROID_API__) && __ANDROID_API__ >= 21))
err = fallocate(fd, 0, 0, length) ? ignore_enosys_and_eremote(errno) : MDBX_SUCCESS;
#elif defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L && !defined(__APPLE__)
err = posix_fallocate(fd, 0, length) ? ignore_enosys_and_eremote(errno) : MDBX_SUCCESS;
#elif defined(__APPLE__)
fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, 0, length, 0};
if (fcntl(fd, F_PREALLOCATE, &store))
err = ignore_enosys_and_eremote(errno);
#endif /* Apple */
#if !defined(_WIN32) && !defined(_WIN64)
/* Workaround for testing: ignore ENOSPC for TMPFS/RAMFS.
* This is insignificant for production, but it helps in some tests using /dev/shm inside docker/containers. */
if (err == ENOSPC && osal_check_fs_incore(fd) == MDBX_RESULT_TRUE)
err = MDBX_RESULT_TRUE;
#if MDBX_USE_FALLOCATE
struct stat info;
if (unlikely(fstat(fd, &info)))
return errno;
const uint64_t allocated = UINT64_C(512) * info.st_blocks;
if (length > allocated) {
#if defined(__APPLE__)
fstore_t store = {
.fst_flags = F_ALLOCATECONTIG, .fst_posmode = F_PEOFPOSMODE, .fst_offset = 0, .fst_length = length};
int err = MDBX_SUCCESS;
if (fcntl(fd, F_PREALLOCATE, &store)) {
/* TODO: implement step-by-step allocation in chunks of 16384, 8192, 4096, 2048, 1024 Kb */
store.fst_flags = F_ALLOCATEALL;
if (fcntl(fd, F_PREALLOCATE, &store))
err = errno;
}
#elif defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L
const int err = posix_fallocate(fd, 0, length);
if (!err && length > (uint64_t)info.st_size)
info.st_size = length /* posix_fallocate() extends the file */;
#else
const int err = fallocate(fd, 0, 0, length) ? errno : MDBX_SUCCESS;
if (!err && length > (uint64_t)info.st_size)
info.st_size = length /* fallocate() extends the file */;
#endif
if (unlikely(err) && ignore_enosys_and_eremote(err) != MDBX_RESULT_TRUE) {
/* Workaround for testing: ignore ENOSPC for TMPFS/RAMFS.
* This is insignificant for production, but it helps in some tests using /dev/shm inside docker/containers. */
if (err != ENOSPC || osal_check_fs_incore(fd) != MDBX_RESULT_TRUE)
return err;
}
}
if (length == (uint64_t)info.st_size)
return MDBX_SUCCESS;
#endif
#if defined(__linux__) || defined(__gnu_linux__)
if (globals.linux_kernel_version < 0x05110000 && globals.linux_kernel_version >= 0x050a0000) {
struct statfs statfs_info;
if (fstatfs(fd, &statfs_info))
return errno;
if (statfs_info.f_type == 0xEF53 /* EXT4_SUPER_MAGIC */ && unlikely(fdatasync(fd)))
return errno;
}
#endif /* Linux */
return unlikely(ftruncate(fd, length)) ? errno : MDBX_SUCCESS;
#endif /* !Windows */
return (err == MDBX_RESULT_TRUE) ? osal_ftruncate(fd, length) : err;
}
int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) {
@@ -1662,7 +1677,7 @@ int osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_r
int osal_thread_join(osal_thread_t thread) {
#if defined(_WIN32) || defined(_WIN64)
DWORD code = WaitForSingleObject(thread, INFINITE);
return waitstatus2errcode(code);
return osal_waitstatus2errcode(code);
#else
void *unused_retval = &unused_retval;
return pthread_join(thread, &unused_retval);
@@ -2087,8 +2102,8 @@ int osal_mmap(const int flags, osal_mmap_t *map, size_t size, const size_t limit
return err;
if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_SETLENGTH) != 0) {
err = osal_fallocate(map->fd, size);
VERBOSE("ftruncate %zu, err %d", size, err);
err = osal_fsetsize(map->fd, size);
VERBOSE("osal_fsetsize %zu, err %d", size, err);
if (err != MDBX_SUCCESS)
return err;
map->filesize = size;
@@ -2333,7 +2348,7 @@ retry_file_and_section:
}
if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) {
err = osal_fallocate(map->fd, size);
err = osal_fsetsize(map->fd, size);
if (err == MDBX_SUCCESS)
map->filesize = size;
/* ignore error, because Windows unable shrink file
@@ -2413,11 +2428,11 @@ retry_mapview:;
} else {
if (map->filesize != size) {
if (size > map->filesize) {
rc = osal_fallocate(map->fd, size);
VERBOSE("f%s-%s %zu, err %d", "allocate", "extend", size, rc);
rc = osal_fsetsize(map->fd, size);
VERBOSE("osal_fsetsize-%s %zu, err %d", "extend", size, rc);
} else if (flags & txn_shrink_allowed) {
rc = osal_ftruncate(map->fd, size);
VERBOSE("f%s-%s %zu, err %d", "truncate", "shrink", size, rc);
rc = osal_fsetsize(map->fd, size);
VERBOSE("osal_fsetsize-%s %zu, err %d", "shrink", size, rc);
}
if (unlikely(rc != MDBX_SUCCESS))
return rc;
@@ -3458,6 +3473,35 @@ bin128_t osal_guid(const MDBX_env *env) {
return uuid;
}
/* Reads the environment variable `name`.
 *
 * Windows: the value is fetched with GetEnvironmentVariableA() into a
 * function-local static buffer, so the returned pointer is overwritten by
 * the next call and the function is not safe for concurrent use. Values that
 * do not fit into the buffer are reported as absent (nullptr). The `secure`
 * argument is ignored.
 *
 * Other systems: returns getenv(name), or secure_getenv(name) when `secure`
 * is set and glibc >= 2.17 is available.
 *
 * Returns nullptr if the variable is absent (or could not be read),
 * otherwise a pointer to its (possibly empty) value. */
const char *osal_getenv(const char *name, bool secure) {
  (void)secure;
#if defined(_WIN32) || defined(_WIN64)
  static char buf[42];
  /* Plant a sentinel error code: if it is still in place after a zero-length
   * result, then the call did not fail and the variable is simply empty. */
  SetLastError(ERROR_OUT_OF_PAPER);
  const size_t len = GetEnvironmentVariableA(name, buf, sizeof(buf));
  if (len >= sizeof(buf))
    /* the value does not fit into the buffer, no idea how to handle this */
    return nullptr;
  if (len != 0)
    return buf;
  switch (GetLastError()) {
  case ERROR_OUT_OF_PAPER:
    return "";
  default:
    /* no idea what to do in case of any other error */
  case ERROR_ENVVAR_NOT_FOUND:
    return nullptr;
  }
#else
#if defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 17)
  if (secure)
    return secure_getenv(name);
#endif /* glibc >= 2.17 */
  return getenv(name);
#endif
}
/*--------------------------------------------------------------------------*/
void osal_ctor(void) {
@@ -3472,14 +3516,26 @@ void osal_ctor(void) {
SYSTEM_INFO si;
GetSystemInfo(&si);
globals.sys_pagesize = si.dwPageSize;
globals.sys_allocation_granularity = si.dwAllocationGranularity;
globals.sys_allocation_granularity =
(si.dwAllocationGranularity > globals.sys_pagesize) ? si.dwAllocationGranularity : globals.sys_pagesize;
#else
globals.sys_pagesize = sysconf(_SC_PAGE_SIZE);
globals.sys_allocation_granularity = (MDBX_WORDBITS > 32) ? 65536 : 16384;
globals.sys_allocation_granularity = (globals.sys_allocation_granularity >= globals.sys_pagesize * 2)
? globals.sys_allocation_granularity
: globals.sys_pagesize * 4;
#ifdef AT_UCACHEBSIZE
const size_t unified_cache_block_size = getauxval(AT_UCACHEBSIZE);
globals.sys_unified_cache_block = globals.sys_pagesize;
if (unified_cache_block_size > 0 && unified_cache_block_size < INT_MAX) {
globals.sys_unified_cache_block = (unsigned)unified_cache_block_size;
if (globals.sys_unified_cache_block > globals.sys_pagesize)
globals.sys_allocation_granularity = globals.sys_unified_cache_block;
}
#endif /* AT_UCACHEBSIZE */
#endif
if (globals.sys_allocation_granularity > 4 * MEGABYTE && globals.sys_pagesize < MEGABYTE)
globals.sys_allocation_granularity = 4 * MEGABYTE;
assert(globals.sys_pagesize > 0 && (globals.sys_pagesize & (globals.sys_pagesize - 1)) == 0);
assert(globals.sys_allocation_granularity >= globals.sys_pagesize &&
globals.sys_allocation_granularity % globals.sys_pagesize == 0);

View File

@@ -133,6 +133,14 @@ static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#define vsnprintf _vsnprintf /* ntdll */
#endif
#ifndef strcasecmp
#define strcasecmp _stricmp /* ntdll */
#endif
#ifndef strncasecmp
#define strncasecmp _strnicmp /* ntdll */
#endif
#else /*----------------------------------------------------------------------*/
typedef pthread_t osal_thread_t;
@@ -171,7 +179,7 @@ typedef char pathchar_t;
#define MDBX_PRIsPATH "s"
#endif
static inline bool osal_yield(void) {
MDBX_MAYBE_UNUSED static inline bool osal_yield(void) {
#if defined(_WIN32) || defined(_WIN64)
return SleepEx(0, true) == WAIT_IO_COMPLETION;
#else
@@ -198,6 +206,23 @@ typedef struct osal_mmap {
#define MDBX_HAVE_PWRITEV 0
/* Maps the status returned by a Win32 wait call onto an error code.
 * A signaled object yields MDBX_SUCCESS, WAIT_FAILED yields the calling
 * thread's last-error value, and the remaining statuses are converted
 * to the corresponding ERROR_xxx constants. */
static inline int osal_waitstatus2errcode(DWORD result) {
  int err;
  switch (result) {
  case WAIT_OBJECT_0:
    err = MDBX_SUCCESS;
    break;
  case WAIT_FAILED:
    err = (int)GetLastError();
    break;
  case WAIT_ABANDONED:
    err = ERROR_ABANDONED_WAIT_0;
    break;
  case WAIT_IO_COMPLETION:
    err = ERROR_USER_APC;
    break;
  case WAIT_TIMEOUT:
    err = ERROR_TIMEOUT;
    break;
  default:
    /* a status that is not expected from a wait call */
    err = ERROR_UNHANDLED_ERROR;
    break;
  }
  return err;
}
#elif defined(__ANDROID_API__)
#if __ANDROID_API__ < 24
@@ -441,8 +466,7 @@ enum osal_syncmode_bits {
};
MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd, const enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL int osal_fallocate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL int osal_fsetsize(mdbx_filehandle_t fd, const uint64_t length);
MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
@@ -468,6 +492,7 @@ MDBX_MAYBE_UNUSED static inline bool osal_isdirsep(pathchar_t c) {
c == '/';
}
MDBX_INTERNAL const char *osal_getenv(const char *name, bool secure);
MDBX_INTERNAL bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, size_t len);
MDBX_INTERNAL pathchar_t *osal_fileext(const pathchar_t *pathname, size_t len);
MDBX_INTERNAL int osal_fileexists(const pathchar_t *pathname);

View File

@@ -43,7 +43,7 @@
#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS)
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 /* Windows 7 */
#define _WIN32_WINNT 0x0A00 /* Windows 10 */
#endif /* _WIN32_WINNT */
#if !defined(_CRT_SECURE_NO_WARNINGS)
@@ -428,6 +428,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/time.h>
#include <sys/uio.h>
#if __GLIBC_PREREQ(2, 16) || __has_include(<sys/auxv.h>)
#include <sys/auxv.h>
#endif /* glibc >= 2.16 */
#endif /*---------------------------------------------------------------------*/
#if defined(__ANDROID_API__) || defined(ANDROID)

View File

@@ -88,7 +88,7 @@ MDBX_INTERNAL int txn_ro_end(MDBX_txn *txn, unsigned mode);
/* env.c */
MDBX_INTERNAL int env_open(MDBX_env *env, mdbx_mode_t mode);
MDBX_INTERNAL int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, size_t bytes, troika_t *troika);
MDBX_INTERNAL int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, troika_t *troika);
MDBX_INTERNAL int env_sync(MDBX_env *env, bool force, bool nonblock);
MDBX_INTERNAL int env_close(MDBX_env *env, bool resurrect_after_fork);
MDBX_INTERNAL MDBX_txn *env_owned_wrtxn(const MDBX_env *env);
@@ -109,8 +109,12 @@ MDBX_INTERNAL void recalculate_merge_thresholds(MDBX_env *env);
MDBX_INTERNAL void recalculate_subpage_thresholds(MDBX_env *env);
/* table.c */
MDBX_INTERNAL int __must_check_result tbl_fetch(MDBX_txn *txn, size_t dbi);
MDBX_INTERNAL int __must_check_result tbl_fetch(MDBX_txn *txn, MDBX_cursor *mc, size_t dbi, const MDBX_val *name,
unsigned wanna_flags);
MDBX_INTERNAL int __must_check_result tbl_create(MDBX_txn *txn, MDBX_cursor *mc, size_t slot, const MDBX_val *name,
unsigned db_flags);
MDBX_INTERNAL int __must_check_result tbl_setup(const MDBX_env *env, volatile kvx_t *const kvx, const tree_t *const db);
MDBX_INTERNAL int __must_check_result tbl_refresh(MDBX_txn *txn, size_t dbi);
/* coherency.c */
MDBX_INTERNAL bool coherency_check_meta(const MDBX_env *env, const volatile meta_t *meta, bool report);

View File

@@ -37,67 +37,99 @@ int tbl_setup(const MDBX_env *env, volatile kvx_t *const kvx, const tree_t *cons
return MDBX_SUCCESS;
}
int tbl_fetch(MDBX_txn *txn, size_t dbi) {
cursor_couple_t couple;
int rc = cursor_init(&couple.outer, txn, MAIN_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
int tbl_fetch(MDBX_txn *txn, MDBX_cursor *mc, size_t dbi, const MDBX_val *name, unsigned wanna_flags) {
int err = cursor_init(mc, txn, MAIN_DBI);
if (unlikely(err != MDBX_SUCCESS))
return err;
kvx_t *const kvx = &txn->env->kvs[dbi];
rc = tree_search(&couple.outer, &kvx->name, 0);
if (unlikely(rc != MDBX_SUCCESS)) {
bailout:
NOTICE("dbi %zu refs to inaccessible table `%.*s` for txn %" PRIaTXN " (err %d)", dbi, (int)kvx->name.iov_len,
(const char *)kvx->name.iov_base, txn->txnid, rc);
return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc;
err = tree_search(mc, name, 0);
if (unlikely(err != MDBX_SUCCESS)) {
if (err == MDBX_NOTFOUND)
goto notfound;
return err;
}
struct node_search_result nsr = node_search(mc, name);
if (unlikely(!nsr.exact)) {
notfound:
if (dbi < txn->env->n_dbi && (txn->env->dbs_flags[dbi] & DB_VALID) && !(wanna_flags & MDBX_CREATE))
NOTICE("dbi %zu refs to non-existing table `%.*s` for txn %" PRIaTXN " (err %d)", dbi, (int)name->iov_len,
(const char *)name->iov_base, txn->txnid, err);
return MDBX_NOTFOUND;
}
if (unlikely((node_flags(nsr.node) & (N_DUP | N_TREE)) != N_TREE)) {
NOTICE("dbi %zu refs to not a named table `%.*s` for txn %" PRIaTXN " (%s)", dbi, (int)name->iov_len,
(const char *)name->iov_base, txn->txnid, "wrong node-flags");
return MDBX_INCOMPATIBLE /* not a named DB */;
}
MDBX_val data;
struct node_search_result nsr = node_search(&couple.outer, &kvx->name);
if (unlikely(!nsr.exact)) {
rc = MDBX_NOTFOUND;
goto bailout;
}
if (unlikely((node_flags(nsr.node) & (N_DUP | N_TREE)) != N_TREE)) {
NOTICE("dbi %zu refs to not a named table `%.*s` for txn %" PRIaTXN " (%s)", dbi, (int)kvx->name.iov_len,
(const char *)kvx->name.iov_base, txn->txnid, "wrong flags");
return MDBX_INCOMPATIBLE; /* not a named DB */
err = node_read(mc, nsr.node, &data, mc->pg[mc->top]);
if (unlikely(err != MDBX_SUCCESS))
return err;
if (unlikely(data.iov_len < sizeof(tree_t))) {
NOTICE("dbi %zu refs to not a named table `%.*s` for txn %" PRIaTXN " (%s)", dbi, (int)name->iov_len,
(const char *)name->iov_base, txn->txnid, "wrong record-size");
return MDBX_INCOMPATIBLE /* not a named DB */;
}
rc = node_read(&couple.outer, nsr.node, &data, couple.outer.pg[couple.outer.top]);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (unlikely(data.iov_len != sizeof(tree_t))) {
NOTICE("dbi %zu refs to not a named table `%.*s` for txn %" PRIaTXN " (%s)", dbi, (int)kvx->name.iov_len,
(const char *)kvx->name.iov_base, txn->txnid, "wrong rec-size");
return MDBX_INCOMPATIBLE; /* not a named DB */
}
uint16_t flags = UNALIGNED_PEEK_16(data.iov_base, tree_t, flags);
const unsigned db_flags = UNALIGNED_PEEK_16(data.iov_base, tree_t, flags);
const pgno_t db_root_pgno = peek_pgno(ptr_disp(data.iov_base, offsetof(tree_t, root)));
/* The txn may not know this DBI, or another process may
* have dropped and recreated the DB with other flags. */
tree_t *const db = &txn->dbs[dbi];
if (unlikely((db->flags & DB_PERSISTENT_FLAGS) != flags)) {
if (unlikely((wanna_flags ^ db_flags) & DB_PERSISTENT_FLAGS) && !(wanna_flags & MDBX_DB_ACCEDE) &&
!((wanna_flags & MDBX_CREATE) && db_root_pgno == P_INVALID)) {
NOTICE("dbi %zu refs to the re-created table `%.*s` for txn %" PRIaTXN
" with different flags (present 0x%X != wanna 0x%X)",
dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base, txn->txnid, db->flags & DB_PERSISTENT_FLAGS,
flags);
return MDBX_INCOMPATIBLE;
dbi, (int)name->iov_len, (const char *)name->iov_base, txn->txnid, db_flags & DB_PERSISTENT_FLAGS,
wanna_flags & DB_PERSISTENT_FLAGS);
return MDBX_INCOMPATIBLE /* not a named DB */;
}
tree_t *const db = &txn->dbs[dbi];
memcpy(db, data.iov_base, sizeof(tree_t));
#if !MDBX_DISABLE_VALIDATION
const txnid_t pp_txnid = couple.outer.pg[couple.outer.top]->txnid;
tASSERT(txn, txn->front_txnid >= pp_txnid);
if (unlikely(db->mod_txnid > pp_txnid)) {
ERROR("db.mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", db->mod_txnid, pp_txnid);
const txnid_t maindb_leafpage_txnid = mc->pg[mc->top]->txnid;
tASSERT(txn, txn->front_txnid >= maindb_leafpage_txnid);
if (unlikely(db->mod_txnid > maindb_leafpage_txnid)) {
ERROR("db.mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", db->mod_txnid, maindb_leafpage_txnid);
return MDBX_CORRUPTED;
}
#endif /* !MDBX_DISABLE_VALIDATION */
rc = tbl_setup_ifneed(txn->env, kvx, db);
return MDBX_SUCCESS;
}
/* Creates the record of a new, empty table.
 *
 * The tree descriptor in txn->dbs[slot] is reset (zeroed, root = P_INVALID,
 * mod_txnid = current txnid, flags = persistent part of `db_flags`) and then
 * stored under the key `name` through the cursor `mc` with the
 * N_TREE | MDBX_NOOVERWRITE flags.
 *
 * `db_flags` must contain MDBX_CREATE (asserted).
 * Returns the result of cursor_put_checklen(); on success the transaction
 * is additionally marked as dirty. */
int tbl_create(MDBX_txn *txn, MDBX_cursor *mc, size_t slot, const MDBX_val *name, unsigned db_flags) {
  tASSERT(txn, db_flags & MDBX_CREATE);

  /* prepare an empty tree descriptor right in the transaction's slot */
  tree_t *const fresh = &txn->dbs[slot];
  memset(fresh, 0, sizeof(tree_t));
  fresh->root = P_INVALID;
  fresh->mod_txnid = txn->txnid;
  fresh->flags = db_flags & DB_PERSISTENT_FLAGS;

  /* the descriptor itself is the value of the record being inserted */
  MDBX_val record;
  record.iov_len = sizeof(tree_t);
  record.iov_base = fresh;

  /* The cursor is linked into the MAIN_DBI cursor list only for the duration
   * of the put, presumably so it is tracked while pages are modified. */
  mc->next = txn->cursors[MAIN_DBI];
  txn->cursors[MAIN_DBI] = mc;
  const int put_err = cursor_put_checklen(mc, name, &record, N_TREE | MDBX_NOOVERWRITE);
  txn->cursors[MAIN_DBI] = mc->next;

  if (likely(put_err == MDBX_SUCCESS)) {
    txn->flags |= MDBX_TXN_DIRTY;
    tASSERT(txn, (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) != 0);
  }
  return put_err;
}
int tbl_refresh(MDBX_txn *txn, size_t dbi) {
cursor_couple_t couple;
kvx_t *const kvx = &txn->env->kvs[dbi];
int rc = tbl_fetch(txn, &couple.outer, dbi, &kvx->name, txn->dbs[dbi].flags);
if (likely(rc != MDBX_SUCCESS))
return dbi_gone(txn, dbi, rc);
rc = tbl_setup_ifneed(txn->env, kvx, &txn->dbs[dbi]);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
return dbi_gone(txn, dbi, rc);
if (unlikely(dbi_changed(txn, dbi)))
return MDBX_BAD_DBI;

View File

@@ -373,25 +373,27 @@ __cold static int rthc_drown(MDBX_env *const env) {
int rc = MDBX_SUCCESS;
MDBX_env *inprocess_neighbor = nullptr;
if (likely(env->lck_mmap.lck && current_pid == env->pid)) {
reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d", (current_pid == env->pid) ? "cleanup" : "skip",
__Wpedantic_format_voidptr(env), env->pid, __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end),
current_pid);
bool cleaned = false;
for (reader_slot_t *r = begin; r < end; ++r) {
if (atomic_load32(&r->pid, mo_Relaxed) == current_pid) {
atomic_store32(&r->pid, 0, mo_AcquireRelease);
TRACE("== cleanup %p", __Wpedantic_format_voidptr(r));
cleaned = true;
}
}
if (cleaned)
atomic_store32(&env->lck_mmap.lck->rdt_refresh_flag, true, mo_Relaxed);
rc = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor);
if (!inprocess_neighbor && env->registered_reader_pid && env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
int err = lck_rpid_clear(env);
rc = rc ? rc : err;
if (!inprocess_neighbor) {
reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d", (current_pid == env->pid) ? "cleanup" : "skip",
__Wpedantic_format_voidptr(env), env->pid, __Wpedantic_format_voidptr(begin),
__Wpedantic_format_voidptr(end), current_pid);
bool cleaned = false;
for (reader_slot_t *r = begin; r < end; ++r) {
if (atomic_load32(&r->pid, mo_Relaxed) == current_pid) {
atomic_store32(&r->pid, 0, mo_AcquireRelease);
TRACE("== cleanup %p", __Wpedantic_format_voidptr(r));
cleaned = true;
}
}
if (cleaned)
atomic_store32(&env->lck_mmap.lck->rdt_refresh_flag, true, mo_Relaxed);
if (env->registered_reader_pid && env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
int err = lck_rpid_clear(env);
rc = rc ? rc : err;
}
}
}
int err = lck_destroy(env, inprocess_neighbor, current_pid);
@@ -513,7 +515,9 @@ __cold void rthc_dtor(const uint32_t current_pid) {
MDBX_env *const env = rthc_table[i].env;
if (env->pid != current_pid)
continue;
if (!(env->flags & ENV_TXKEY))
if (!env->lck_mmap.lck || env->lck_mmap.base == MAP_FAILED)
continue;
if (!(env->flags & ENV_TXKEY) || !env->lck_mmap.lck)
continue;
env->flags -= ENV_TXKEY;
reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];

View File

@@ -414,7 +414,6 @@ int main(int argc, char *argv[]) {
if (argc < 2)
usage(prog);
double elapsed;
#if defined(_WIN32) || defined(_WIN64)
uint64_t timestamp_start, timestamp_finish;
timestamp_start = GetMilliseconds();
@@ -652,23 +651,26 @@ bailout:
#if defined(_WIN32) || defined(_WIN64)
timestamp_finish = GetMilliseconds();
elapsed = (timestamp_finish - timestamp_start) * 1e-3;
const uint64_t elapsed_msec = (timestamp_finish - timestamp_start);
#else
if (clock_gettime(CLOCK_MONOTONIC, &timestamp_finish)) {
error_fn("clock_gettime", errno);
return EXIT_FAILURE_SYS;
}
elapsed =
timestamp_finish.tv_sec - timestamp_start.tv_sec + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9;
const uint64_t elapsed_msec = UINT64_C(1000) * (timestamp_finish.tv_sec - timestamp_start.tv_sec) +
(timestamp_finish.tv_nsec - timestamp_start.tv_nsec) / 1000000;
#endif /* !WINDOWS */
const size_t elapsed_seconds = (size_t)(elapsed_msec / 1000u);
const size_t elapsed_mod_ms = (size_t)(elapsed_msec % 1000u);
if (chk.result.total_problems) {
print_ln(MDBX_chk_result, "Total %" PRIuSIZE " error%s detected, elapsed %.3f seconds.", chk.result.total_problems,
(chk.result.total_problems > 1) ? "s are" : " is", elapsed);
print_ln(MDBX_chk_result, "Total %" PRIuSIZE " error%s detected, elapsed %zu.%03zu seconds.",
chk.result.total_problems, (chk.result.total_problems > 1) ? "s are" : " is", elapsed_seconds,
elapsed_mod_ms);
if (chk.result.problems_meta || chk.result.problems_kv || chk.result.problems_gc)
return EXIT_FAILURE_CHECK_MAJOR;
return EXIT_FAILURE_CHECK_MINOR;
}
print_ln(MDBX_chk_result, "No error is detected, elapsed %.3f seconds.", elapsed);
print_ln(MDBX_chk_result, "No error is detected, elapsed %zu.%03zu seconds.", elapsed_seconds, elapsed_mod_ms);
return EXIT_SUCCESS;
}

View File

@@ -341,8 +341,8 @@ int main(int argc, char *argv[]) {
}
print_stat(&mst);
pgno_t pages = 0, *iptr;
pgno_t reclaimable = 0;
size_t gc_pages = 0, *iptr;
size_t gc_reclaimable = 0;
MDBX_val key, data;
while (MDBX_SUCCESS == (rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT))) {
if (user_break) {
@@ -352,9 +352,9 @@ int main(int argc, char *argv[]) {
iptr = data.iov_base;
const pgno_t number = *iptr++;
pages += number;
gc_pages += number;
if (envinfo && mei.mi_latter_reader_txnid > *(txnid_t *)key.iov_base)
reclaimable += number;
gc_reclaimable += number;
if (freinfo > 1) {
char *bad = "";
@@ -402,36 +402,43 @@ int main(int argc, char *argv[]) {
}
if (envinfo) {
uint64_t value = mei.mi_mapsize / mei.mi_dxb_pagesize;
double percent = value / 100.0;
printf("Page Usage\n");
printf(" Total: %" PRIu64 " 100%%\n", value);
char buffer[64];
value = mei.mi_geo.current / mei.mi_dxb_pagesize;
printf(" Backed: %" PRIu64 " %.1f%%\n", value, value / percent);
puts("Page Usage");
const size_t total_pages = mei.mi_mapsize / mei.mi_dxb_pagesize;
printf(" Total: %" PRIuSIZE " 100%%\n", total_pages);
value = mei.mi_last_pgno + 1;
printf(" Allocated: %" PRIu64 " %.1f%%\n", value, value / percent);
const size_t backed_pages = mei.mi_geo.current / mei.mi_dxb_pagesize;
printf(" Backed: %" PRIuSIZE " %s%%\n", backed_pages,
mdbx_ratio2percents(backed_pages, total_pages, buffer, sizeof(buffer)));
value = mei.mi_mapsize / mei.mi_dxb_pagesize - (mei.mi_last_pgno + 1);
printf(" Remained: %" PRIu64 " %.1f%%\n", value, value / percent);
const size_t allocated_pages = mei.mi_last_pgno + 1;
printf(" Allocated: %" PRIuSIZE " %s%%\n", allocated_pages,
mdbx_ratio2percents(allocated_pages, total_pages, buffer, sizeof(buffer)));
value = mei.mi_last_pgno + 1 - pages;
printf(" Used: %" PRIu64 " %.1f%%\n", value, value / percent);
const size_t remained_pages = total_pages - allocated_pages;
printf(" Remained: %" PRIuSIZE " %s%%\n", remained_pages,
mdbx_ratio2percents(remained_pages, total_pages, buffer, sizeof(buffer)));
value = pages;
printf(" GC: %" PRIu64 " %.1f%%\n", value, value / percent);
const size_t used_pages = allocated_pages - gc_pages;
printf(" Used: %" PRIuSIZE " %s%%\n", used_pages,
mdbx_ratio2percents(used_pages, total_pages, buffer, sizeof(buffer)));
value = pages - reclaimable;
printf(" Retained: %" PRIu64 " %.1f%%\n", value, value / percent);
printf(" GC: %" PRIuSIZE " %s%%\n", gc_pages,
mdbx_ratio2percents(gc_pages, total_pages, buffer, sizeof(buffer)));
value = reclaimable;
printf(" Reclaimable: %" PRIu64 " %.1f%%\n", value, value / percent);
printf(" Reclaimable: %" PRIuSIZE " %s%%\n", gc_reclaimable,
mdbx_ratio2percents(gc_reclaimable, total_pages, buffer, sizeof(buffer)));
value = mei.mi_mapsize / mei.mi_dxb_pagesize - (mei.mi_last_pgno + 1) + reclaimable;
printf(" Available: %" PRIu64 " %.1f%%\n", value, value / percent);
const size_t gc_retained = gc_pages - gc_reclaimable;
printf(" Retained: %" PRIuSIZE " %s%%\n", gc_retained,
mdbx_ratio2percents(gc_retained, total_pages, buffer, sizeof(buffer)));
const size_t available_pages = gc_reclaimable + remained_pages;
printf(" Available: %" PRIuSIZE " %s%%\n", available_pages,
mdbx_ratio2percents(available_pages, total_pages, buffer, sizeof(buffer)));
} else
printf(" GC: %" PRIaPGNO " pages\n", pages);
printf(" GC: %" PRIuSIZE " pages\n", gc_pages);
}
rc = mdbx_dbi_open(txn, table, MDBX_DB_ACCEDE, &dbi);

View File

@@ -39,7 +39,7 @@ __hot int tree_search(MDBX_cursor *mc, const MDBX_val *key, int flags) {
const size_t dbi = cursor_dbi(mc);
if (unlikely(*cursor_dbi_state(mc) & DBI_STALE)) {
err = tbl_fetch(mc->txn, dbi);
err = tbl_refresh_absent2baddbi(mc->txn, dbi);
if (unlikely(err != MDBX_SUCCESS))
goto bailout;
}

View File

@@ -168,6 +168,15 @@ int txn_ro_end(MDBX_txn *txn, unsigned mode) {
txn->ro.slot = nullptr;
else {
eASSERT(env, slot->pid.weak == env->pid);
if (unlikely(slot->pid.weak == 0)) {
txn->flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED | MDBX_TXN_OUSTED;
txn->owner = 0;
if (mode & TXN_END_FREE) {
txn->signature = 0;
osal_free(txn);
}
return LOG_IFERR(MDBX_BAD_RSLOT);
}
if (likely((txn->flags & MDBX_TXN_FINISHED) == 0)) {
if (likely((txn->flags & MDBX_TXN_PARKED) == 0)) {
ENSURE(env, txn->txnid >=

View File

@@ -47,7 +47,7 @@ int txn_shadow_cursors(const MDBX_txn *parent, const size_t dbi) {
int err = cursor_shadow(cursor, txn, dbi);
if (unlikely(err != MDBX_SUCCESS)) {
/* не получилось забекапить курсоры */
txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE;
txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO;
txn->flags |= MDBX_TXN_ERROR;
return err;
}

View File

@@ -41,3 +41,52 @@ MDBX_NOTHROW_CONST_FUNCTION uint64_t rrxmrrxmsx_0(uint64_t v) {
v *= UINT64_C(0x9FB21C651E98DF25);
return v ^ v >> 28;
}
/* Formats the ratio v/d as a decimal string with `precision` digits after the
 * decimal point, rounded to the nearest representable value.
 *
 * The text is built in place inside `buffer`: the decimal point lives at
 * buffer->string[21], integer digits grow to the left of it and fractional
 * digits to the right.
 *
 * v, d       Numerator and denominator; d must be non-zero.
 * buffer     Caller-provided scratch storage that receives the text.
 * precision  Number of fractional digits, must be less than 20; with
 *            precision <= 0 only the rounded integer part is produced.
 *
 * Returns a pointer to the first character of the NUL-terminated result,
 * which points inside `buffer` (not necessarily at its beginning). */
__cold char *ratio2digits(const uint64_t v, const uint64_t d, ratio2digits_buffer_t *const buffer, int precision) {
  assert(d > 0 && precision < 20);
  char *const dot = buffer->string + 21;
  uint64_t i = v / d, f = v % d, m = d;

  char *tail = dot;
  /* Round up when the remainder exceeds half of the divisor. `f > m - f` is
   * `2 * f > m` written without overflow; unlike `m - f < m / 2` it does not
   * lose the round-up for odd divisors (e.g. 2/3). */
  bool carry = f > m - f;
  if (precision > 0) {
    *tail = '.';
    do {
      /* Scale both terms down so that `f * 10` cannot overflow. */
      while (unlikely(f > UINT64_MAX / 10)) {
        f >>= 1;
        m >>= 1;
      }
      /* The shifts may collapse f and m to the same value; keep f < m,
       * otherwise the next digit would come out as 10. */
      if (unlikely(f >= m))
        f = m - 1;
      f *= 10;
      assert(tail >= dot && tail < ARRAY_END(buffer->string) - 2);
      *++tail = '0' + (char)(f / m);
      f %= m;
      /* Stop early enough to always leave room for the terminating NUL. */
    } while (--precision && tail < ARRAY_END(buffer->string) - 2);
    carry = f > m - f;
    /* Propagate the rounding carry through the fractional digits. */
    for (char *scan = tail; carry && scan > dot; --scan)
      *scan = (carry = *scan == '9') ? '0' : *scan + 1;
    assert(tail > dot && tail < ARRAY_END(buffer->string) - 1);
    *++tail = '\0';
  } else {
    /* No fractional part: terminate right where the point would have been,
     * so no uninitialized byte is left inside the result. */
    *tail = '\0';
  }

  /* Emit the integer part right-to-left, including any remaining carry. */
  char *head = dot;
  i += carry;
  while (i > 9) {
    assert(head > buffer->string && head < ARRAY_END(buffer->string));
    *--head = '0' + (char)(i % 10);
    i /= 10;
  }
  assert(head > buffer->string && head < ARRAY_END(buffer->string));
  *--head = '0' + (char)i;
  return head;
}
/* Formats `value` as a percentage of `whole` via ratio2digits().
 *
 * One fractional digit is used when value is zero or lies within
 * (whole / 16, whole]; otherwise two digits are used.
 * Returns whatever ratio2digits() returns for the given `buffer`. */
__cold char *ratio2percent(uint64_t value, uint64_t whole, ratio2digits_buffer_t *buffer) {
  /* Halve both terms until `value * 100` is guaranteed to fit in 64 bits. */
  const uint64_t scale_limit = UINT64_MAX / 100;
  for (; unlikely(value > scale_limit); value >>= 1)
    whole >>= 1;

  int digits = 2;
  if (whole >= value && (value == 0 || value > whole / 16))
    digits = 1;
  return ratio2digits(value * 100, whole, buffer, digits);
}

View File

@@ -76,3 +76,10 @@ MDBX_MAYBE_UNUSED static inline uint64_t monotime_since_cached(uint64_t begin_ti
}
return cache->value - begin_timestamp;
}
/* Scratch storage for ratio2digits() and ratio2percent().
 * The size is spelled out as 1 + 20 + 1 + 19 + 1 = 42 bytes; 20 is the maximum
 * number of decimal digits of a uint64_t.
 * NOTE(review): presumably the terms are a spare leading byte, the integer
 * digits, the decimal point, the fractional digits and the terminating NUL;
 * confirm against the ratio2digits() implementation. */
typedef struct ratio2digits_buffer {
char string[1 + 20 + 1 + 19 + 1];
} ratio2digits_buffer_t;
/* Formats v/d as decimal text with `precision` fractional digits into `buffer`
 * and returns a pointer to the resulting string. */
char *ratio2digits(const uint64_t v, const uint64_t d, ratio2digits_buffer_t *const buffer, int precision);
/* Formats v as a percentage of d into `buffer` and returns a pointer to the
 * resulting string. */
char *ratio2percent(const uint64_t v, const uint64_t d, ratio2digits_buffer_t *const buffer);

View File

@@ -8,7 +8,7 @@ DIR="$(dirname ${BASH_SOURCE[0]})"
TEST="${DIR}/stochastic.sh --skip-make --db-upto-gb 32"
PREFIX="/dev/shm/mdbxtest-"
NUMACTL="$(which numactl 2>-)"
NUMACTL="$(which numactl 2>&-)"
NUMALIST=()
NUMAIDX=0
if [ -n "${NUMACTL}" -a $(${NUMACTL} --hardware | grep 'node [0-9]\+ cpus' | wc -l) -gt 1 ]; then

View File

@@ -3,10 +3,6 @@
#include "test.h++"
#if defined(_MSC_VER) && !defined(strcasecmp)
#define strcasecmp(str, len) _stricmp(str, len)
#endif /* _MSC_VER && strcasecmp() */
namespace config {
bool parse_option(int argc, char *const argv[], int &narg, const char *option, const char **value,

View File

@@ -22,9 +22,7 @@ void testcase_jitter::check_dbi_error(int expect, const char *stage) {
bool testcase_jitter::run() {
int err;
size_t upper_limit = config.params.size_upper;
if (upper_limit < 1)
upper_limit = config.params.size_now * 2;
size_t upper_limit = (config.params.size_upper < 1) ? config.params.size_now * 2 : config.params.size_upper;
tablename_buf buffer;
const char *const tablename = db_tablename(buffer);

View File

@@ -9,32 +9,15 @@ static std::unordered_map<unsigned, HANDLE> events;
static HANDLE hBarrierSemaphore, hBarrierEvent;
static HANDLE hProgressActiveEvent, hProgressPassiveEvent;
/* Translates the status returned by a Win32 wait call into an error code:
 * MDBX_SUCCESS for WAIT_OBJECT_0, GetLastError() for WAIT_FAILED, a fixed
 * ERROR_* value for the other recognized statuses, and
 * ERROR_UNHANDLED_ERROR for anything else. */
static int waitstatus2errcode(DWORD result) {
  if (result == WAIT_OBJECT_0)
    return MDBX_SUCCESS;
  if (result == WAIT_FAILED)
    return GetLastError();
  if (result == WAIT_ABANDONED)
    return ERROR_ABANDONED_WAIT_0;
  if (result == WAIT_IO_COMPLETION)
    return ERROR_USER_APC;
  if (result == WAIT_TIMEOUT)
    return ERROR_TIMEOUT;
  return ERROR_UNHANDLED_ERROR;
}
void osal_wait4barrier(void) {
DWORD rc = WaitForSingleObject(hBarrierSemaphore, 0);
switch (rc) {
default:
failure_perror("WaitForSingleObject(BarrierSemaphore)", waitstatus2errcode(rc));
failure_perror("WaitForSingleObject(BarrierSemaphore)", osal_waitstatus2errcode(rc));
case WAIT_OBJECT_0:
rc = WaitForSingleObject(hBarrierEvent, INFINITE);
if (rc != WAIT_OBJECT_0)
failure_perror("WaitForSingleObject(BarrierEvent)", waitstatus2errcode(rc));
failure_perror("WaitForSingleObject(BarrierEvent)", osal_waitstatus2errcode(rc));
break;
case WAIT_TIMEOUT:
if (!SetEvent(hBarrierEvent))
@@ -95,7 +78,7 @@ void osal_broadcast(unsigned id) {
int osal_waitfor(unsigned id) {
log_trace("osal_waitfor: event %u", id);
DWORD rc = WaitForSingleObject(events.at(id), INFINITE);
return waitstatus2errcode(rc);
return osal_waitstatus2errcode(rc);
}
int osal_delay(unsigned seconds) {
@@ -397,7 +380,7 @@ int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) {
return 0;
}
return waitstatus2errcode(rc);
return osal_waitstatus2errcode(rc);
}
}
@@ -418,7 +401,7 @@ void osal_udelay(size_t us) {
if (us > threshold_us && us > 1000) {
DWORD rc = SleepEx(unsigned(us / 1000), TRUE);
if (rc)
failure_perror("SleepEx()", waitstatus2errcode(rc));
failure_perror("SleepEx()", osal_waitstatus2errcode(rc));
us = 0;
}