mdbx: изменение лицензии и реструктуризация исходного кода.

This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2024-05-19 22:07:58 +03:00
parent e9f5c0c308
commit 3de3d425a1
139 changed files with 34551 additions and 33907 deletions

34
AUTHORS
View File

@ -1,34 +0,0 @@
Contributors
============
- Alexey Naumov <alexey.naumov@gmail.com>
- Andrew Ashikhmin <andrey.ashikhmin@gmail.com>
- Chris Mikkelson <cmikk@qwest.net>
- Claude Brisson <claude.brisson@gmail.com>
- David Barbour <dmbarbour@gmail.com>
- David Wilson <dw@botanicus.net>
- dreamsxin <dreamsxin@126.com>
- Hallvard Furuseth <hallvard@openldap.org>, <h.b.furuseth@usit.uio.no>
- Heiko Becker <heirecka@exherbo.org>
- Howard Chu <hyc@openldap.org>, <hyc@symas.com>
- Ignacio Casal Quinteiro <ignacio.casal@nice-software.com>
- James Rouzier <rouzier@gmail.com>
- Jean-Christophe DUBOIS <jcd@tribudubois.net>
- John Hewson <john@jahewson.com>
- Klaus Malorny <klaus.malorny@knipp.de>
- Kurt Zeilenga <kurt.zeilenga@isode.com>
- Leonid Yuriev <leo@yuriev.ru>, <lyuryev@ptsecurity.ru>
- Lorenz Bauer <lmb@cloudflare.com>
- Luke Yeager <lyeager@nvidia.com>
- Martin Hedenfalk <martin@bzero.se>
- Ondrej Kuznik <ondrej.kuznik@acision.com>
- Orivej Desh <orivej@gmx.fr>
- Oskari Timperi <oskari.timperi@iki.fi>
- Pavel Medvedev <pmedvedev@gmail.com>
- Philipp Storz <philipp.storz@bareos.com>
- Quanah Gibson-Mount <quanah@openldap.org>
- Salvador Ortiz <sog@msg.com.mx>
- Sebastien Launay <sebastien@slaunay.fr>
- Vladimir Romanov <vromanov@gmail.com>
- Zano Foundation <crypto.sowle@gmail.com>
- 장세연 <sasgas@castis.com>

View File

@ -1,16 +1,5 @@
##
## Copyright 2020-2024 Leonid Yuriev <leo@yuriev.ru>
## and other libmdbx authors: please see AUTHORS file.
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted only as authorized by the OpenLDAP
## Public License.
##
## A copy of this license is available in the file LICENSE in the
## top-level directory of the distribution or, alternatively, at
## <http://www.OpenLDAP.org/license.html>.
##
## Copyright (c) 2020-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
## SPDX-License-Identifier: Apache-2.0
##
## libmdbx = { Revised and extended descendant of Symas LMDB. }
@ -69,14 +58,109 @@ else()
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/COPYRIGHT" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/NOTICE" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/README.md" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h++" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/CMakeLists.txt" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/core.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/alloy.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-cursor.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-env.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-extra.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-key-transform.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-txn.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/atomics-ops.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/atomics-types.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/audit.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/chk.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cogs.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cogs.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/coherency.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cold.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/config.h.in" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/copy.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cursor.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cursor.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dbi.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dbi.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/debug_begin.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/debug_end.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dpl.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dpl.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dxb.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/env-opts.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/env.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/essentials.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/gc-get.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/gc-put.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/gc.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/global.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/internals.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/layout-dxb.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/layout-lck.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck-posix.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck-windows.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/lck.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/logging_and_debug.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/logging_and_debug.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_chk.1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_copy.1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_drop.1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_dump.1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_load.1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1/mdbx_stat.1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mdbx.c++" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/meta.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/meta.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/misc.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mvcc-readers.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/node.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/node.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ntdll.def" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/options.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/osal.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/osal.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-get.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-iov.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-iov.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-ops.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-ops.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-search.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/pnl.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/pnl.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/preface.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/proto.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/range-estimate.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/refund.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/sort.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/subdb.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tls.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tls.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/chk.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/copy.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/drop.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/dump.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/load.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/stat.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/wingetopt.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/wingetopt.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tree.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/unaligned.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/utils.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/utils.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/version.c.in" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/man1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mdbx_chk.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mdbx.c++")
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/walk.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/walk.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/windows-import.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/windows-import.h")
set(MDBX_AMALGAMATED_SOURCE FALSE)
find_program(GIT git)
if(NOT GIT)
@ -84,21 +168,27 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND
endif()
set(MDBX_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/NOTICE" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.c++" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/config.h.in" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/man1" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_chk.c")
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h++" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_chk.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_copy.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_dump.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_load.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_stat.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx_drop.c" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ntdll.def" AND
EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/config.h.in")
set(MDBX_AMALGAMATED_SOURCE TRUE)
set(MDBX_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
else()
message(FATAL_ERROR "\n"
"Please don't use tarballs nor zips which are automatically provided by Github! "
"These archives do not contain version information and thus are unfit to build libmdbx. "
"You can vote for ability of disabling auto-creation such unsuitable archives at https://github.community/t/disable-tarball\n"
"Instead of above, just clone the git repository, either download a tarball or zip with the properly amalgamated source core. "
"For embedding libmdbx use a git-submodule or the amalgamated source code.\n"
"Please, avoid using any other techniques.")
"The set of libmdbx source code files is incomplete! "
"Instead just follow the https://libmdbx.dqdkfa.ru/usage.html "
"PLEASE, AVOID USING ANY OTHER TECHNIQUES.")
endif()
if(DEFINED PROJECT_NAME)
@ -600,13 +690,88 @@ else()
include_directories("${MDBX_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}")
else()
list(APPEND LIBMDBX_SOURCES
"${MDBX_SOURCE_DIR}/api-cursor.c"
"${MDBX_SOURCE_DIR}/api-env.c"
"${MDBX_SOURCE_DIR}/api-extra.c"
"${MDBX_SOURCE_DIR}/api-key-transform.c"
"${MDBX_SOURCE_DIR}/api-txn.c"
"${MDBX_SOURCE_DIR}/atomics-ops.h"
"${MDBX_SOURCE_DIR}/atomics-types.h"
"${MDBX_SOURCE_DIR}/audit.c"
"${MDBX_SOURCE_DIR}/chk.c"
"${MDBX_SOURCE_DIR}/cogs.c"
"${MDBX_SOURCE_DIR}/cogs.h"
"${MDBX_SOURCE_DIR}/coherency.c"
"${MDBX_SOURCE_DIR}/cold.c"
"${MDBX_SOURCE_DIR}/copy.c"
"${MDBX_SOURCE_DIR}/cursor.c"
"${MDBX_SOURCE_DIR}/cursor.h"
"${MDBX_SOURCE_DIR}/dbi.c"
"${MDBX_SOURCE_DIR}/dbi.h"
"${MDBX_SOURCE_DIR}/dpl.c"
"${MDBX_SOURCE_DIR}/dpl.h"
"${MDBX_SOURCE_DIR}/dxb.c"
"${MDBX_SOURCE_DIR}/env-opts.c"
"${MDBX_SOURCE_DIR}/env.c"
"${MDBX_SOURCE_DIR}/essentials.h"
"${MDBX_SOURCE_DIR}/gc-get.c"
"${MDBX_SOURCE_DIR}/gc-put.c"
"${MDBX_SOURCE_DIR}/gc.h"
"${MDBX_SOURCE_DIR}/global.c"
"${MDBX_SOURCE_DIR}/internals.h"
"${MDBX_SOURCE_DIR}/layout-dxb.h"
"${MDBX_SOURCE_DIR}/layout-lck.h"
"${MDBX_SOURCE_DIR}/lck.c"
"${MDBX_SOURCE_DIR}/lck.h"
"${MDBX_SOURCE_DIR}/logging_and_debug.c"
"${MDBX_SOURCE_DIR}/logging_and_debug.h"
"${MDBX_SOURCE_DIR}/meta.c"
"${MDBX_SOURCE_DIR}/meta.h"
"${MDBX_SOURCE_DIR}/misc.c"
"${MDBX_SOURCE_DIR}/mvcc-readers.c"
"${MDBX_SOURCE_DIR}/node.c"
"${MDBX_SOURCE_DIR}/node.h"
"${MDBX_SOURCE_DIR}/options.h"
"${MDBX_SOURCE_DIR}/osal.c"
"${MDBX_SOURCE_DIR}/osal.h"
"${MDBX_SOURCE_DIR}/page-get.c"
"${MDBX_SOURCE_DIR}/page-iov.c"
"${MDBX_SOURCE_DIR}/page-iov.h"
"${MDBX_SOURCE_DIR}/page-ops.c"
"${MDBX_SOURCE_DIR}/page-ops.h"
"${MDBX_SOURCE_DIR}/page-search.c"
"${MDBX_SOURCE_DIR}/pnl.c"
"${MDBX_SOURCE_DIR}/pnl.h"
"${MDBX_SOURCE_DIR}/preface.h"
"${MDBX_SOURCE_DIR}/proto.h"
"${MDBX_SOURCE_DIR}/range-estimate.c"
"${MDBX_SOURCE_DIR}/refund.c"
"${MDBX_SOURCE_DIR}/sort.h"
"${MDBX_SOURCE_DIR}/spill.c"
"${MDBX_SOURCE_DIR}/spill.h"
"${MDBX_SOURCE_DIR}/subdb.c"
"${MDBX_SOURCE_DIR}/tls.c"
"${MDBX_SOURCE_DIR}/tls.h"
"${MDBX_SOURCE_DIR}/tree.c"
"${MDBX_SOURCE_DIR}/txl.c"
"${MDBX_SOURCE_DIR}/txl.h"
"${MDBX_SOURCE_DIR}/txn.c"
"${MDBX_SOURCE_DIR}/unaligned.h"
"${MDBX_SOURCE_DIR}/utils.c"
"${MDBX_SOURCE_DIR}/utils.h"
"${MDBX_SOURCE_DIR}/walk.c"
"${MDBX_SOURCE_DIR}/walk.h"
"${CMAKE_CURRENT_BINARY_DIR}/version.c"
"${MDBX_SOURCE_DIR}/options.h" "${MDBX_SOURCE_DIR}/base.h"
"${MDBX_SOURCE_DIR}/internals.h" "${MDBX_SOURCE_DIR}/osal.h"
"${MDBX_SOURCE_DIR}/core.c" "${MDBX_SOURCE_DIR}/osal.c"
"${MDBX_SOURCE_DIR}/lck-posix.c")
)
if(NOT MSVC)
list(APPEND LIBMDBX_SOURCES "${MDBX_SOURCE_DIR}/lck-posix.c")
endif()
if(NOT APPLE)
list(APPEND LIBMDBX_SOURCES "${MDBX_SOURCE_DIR}/lck-windows.c")
list(APPEND LIBMDBX_SOURCES
"${MDBX_SOURCE_DIR}/windows-import.h"
"${MDBX_SOURCE_DIR}/windows-import.c"
"${MDBX_SOURCE_DIR}/lck-windows.c"
)
endif()
include_directories("${MDBX_SOURCE_DIR}")
endif()
@ -747,20 +912,23 @@ endif()
# build mdbx-tools
if(MDBX_BUILD_TOOLS)
if(NOT MDBX_AMALGAMATED_SOURCE AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(WINGETOPT_SRC ${MDBX_SOURCE_DIR}/wingetopt.c ${MDBX_SOURCE_DIR}/wingetopt.h)
else()
set(WINGETOPT_SRC "")
set(WINGETOPT_SRC "")
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(WINGETOPT_SRC ${MDBX_SOURCE_DIR}/tools/wingetopt.c ${MDBX_SOURCE_DIR}/tools/wingetopt.h)
endif()
foreach(TOOL mdbx_chk mdbx_copy mdbx_stat mdbx_dump mdbx_load mdbx_drop)
add_executable(${TOOL} mdbx.h ${MDBX_SOURCE_DIR}/${TOOL}.c ${WINGETOPT_SRC})
foreach(TOOL chk copy stat dump load drop)
if(MDBX_AMALGAMATED_SOURCE)
add_executable(mdbx_${TOOL} mdbx.h ${MDBX_SOURCE_DIR}/mdbx_${TOOL}.c)
else()
add_executable(mdbx_${TOOL} mdbx.h ${MDBX_SOURCE_DIR}/tools/${TOOL}.c ${WINGETOPT_SRC})
endif()
if(NOT C_FALLBACK_GNU11 AND NOT C_FALLBACK_11)
set_target_properties(${TOOL} PROPERTIES
set_target_properties(mdbx_${TOOL} PROPERTIES
C_STANDARD ${MDBX_C_STANDARD} C_STANDARD_REQUIRED ON)
endif()
target_setup_options(${TOOL})
target_link_libraries(${TOOL} ${TOOL_MDBX_LIB})
target_setup_options(mdbx_${TOOL})
target_link_libraries(mdbx_${TOOL} ${TOOL_MDBX_LIB})
endforeach()
if(LIB_MATH)
target_link_libraries(mdbx_chk ${LIB_MATH})

158
COPYRIGHT
View File

@ -1,7 +1,138 @@
Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>.
Copyright 2011-2015 Howard Chu, Symas Corp.
Copyright 2015,2016 Peter-Service R&D LLC.
All rights reserved.
Copyright (c) 2015-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-----------------------------------------------------------------------
СМЕНА ЛИЦЕНЗИИ (THE LICENSE CHANGE)
OpenLDAP Public License → Apache 2.0
Briefly:
Historically, in 2015 an early MDBX source code was derived from the
"LMDB engine" created by Howard Chu <hyc@symas.com> in 2011-2015,
which was based on btree.c written by Martin Hedenfalk <martin@bzero.se>.
By 2024, MDBX source code has actually been rewritten and has so
little in common with the original LMDB that I thought it admissible
to change the license. Below are more detailed explanations.
Кратко:
Исторически в 2015 году ранний исходный MDBX был заимствован из «LMDB
engine», созданной Howard Chu <hyc@symas.com> в 2011-2015, на основе
btree.c созданного Martin Hedenfalk <martin@bzero.se> в 2009-2010.
К 2024 году исходный код MDBX фактически переписан и имеет настолько
мало общего с первоначальным заимствованием из LMDB, что я счел
уместным сменить лицензию. Ниже более подробные пояснения.
---
Первоисточник текста сформулирован на русском языке, который является
родным для автора. Предполагается что все заинтересованные могут легко
воспользоваться машинным переводом, который при всех недостатках сможет
донести суть, намерения и местами даже передать тональность.
The original source of this text is in Russian, which is the author's
native language. It is assumed that all concerned can easily use machine
translation, which, with all the disadvantages, will be able to convey
the essence, intentions and, in some places, even convey the tonality of
a wording.
1. Причины
1.1. Лицензия Apache-2.0 является одной из самых популярных, так как
содержит ряд уточнений, проясняющих и упрощающих использование исходного
кода в производных работах и больших проектах. Эти особенности лицензии
Apache-2.0 я нахожу достаточно ценными и удобными. Соответственно,
переход на лицензию Apache-2.0 полезным в целом.
1.2. Проект OpenLDAP имеет определенную известность, в том числе, к
сожалению, среди специалистов славится крайне плохим качеством кода и
сбоями при отходе от простых/базовых сценариев использования. Поэтому
использование лицензии OpenLDAP, в глазах части аудитории, бросает тень
на качества кода libmdbx, несмотря на то, что исходный код библиотеки
переписан, в том числе, с целью повышения качества, надежности,
стабильности и пригодности к тестированию.
Отмечу, что здесь не место для обсуждения объективности подобных мнений
и причин, равно как и не место для оценки компетентности специалистов
высказывающих такие суждения. Однако, здесь необходимо озвучить сам факт
наличия такой негативной коннотации качества кода при упоминании
OpenLDAP, совершенно без намерения как-либо задеть или обидеть
контрибьюторов OpenLDAP.
1.3. С точки зрения исходного кода, к настоящему времени libmdbx стала
совсем другим продуктом, о котором сейчас правильнее сказать что
разработка вдохновлена LMDB, нежели основывается на заимствовании кода.
Смена лицензии на переписанный код подчеркивает, что это действительно
новый исходный код.
2. Легитимность
2.1. Исходная лицензия OpenLDAP 2.8 и актуальная лицензия Apache 2.0
совпадают по базовым условиям. При этом лицензия Apache 2.0 уточняет,
определяет и проясняет многие аспекты. Поэтому смену лицензии я склонен
трактовать как уточнение, но не как принципиальное изменение, которое
могло-бы нарушить чьи-либо права.
2.2. С процедурной точки зрения, у меня есть право сменить лицензию на
новый, написанный мной, исходный код. При этом объективно существует как
техническая, так и юридическая проблемы отделения «нового кода» от
«заимствованного», а также выделение/классификация кода, который
является общественным достоянием и/или общеупотребительным воплощением
«математических моделей и других публичных знаний».
Основываясь на собственной субъективной оценке кодовой базы, включая
соотношения «нового», «заимствованного» и «общеупотребительного»
исходного кода, я считаю что смена лицензии допустима. Одновременно с
этим, я понимаю и признаю, что можно найти повод, чтобы трактовать
ситуацию как «стакан наполовину полон/пуст». Поэтому декларирую
готовность принимать претензии и устранять их путем полного
переписывания оставшегося исходного кода, который попадает под критерии
«заимствованного» и кто-то из контрибьюторов которого будет против
изменения лицензии.
2.3. Вне зависимости от истории происхождения каждой строки исходного
кода и её буквального авторства, прошу не считать производимую смену
лицензии, и связанных с этим технических действий, как попытку плагиата,
присвоения чужого труда, присвоения авторства или принижения вклада
других авторов/контрибьюторов. Безусловно проект MDBX/libmdbx не появился
бы без LMDB и всех участников проекта LMDB, в особенности Говарда Чу
(Howard Chu), Холлварда Фурусет (Hallvard Furuseth) и Мартина Хеденфок
(Martin Hedenfalk). Как бы исходный код ни переписывался, он всё равно
будет основываться на базовых идеях и включать основные концепции LMDB.
3. Последствия и актуальные требования
Всё очень просто. Потребуется обеспечить требования новой лицензии в
соответствии с 4-м пунктом лицензии Apache 2.0.
В частности, при использовании/распространении libmdbx потребуется
обеспечить наличие файлов с текстом лицензии и файла NOTICE, а также
обеспечить пользователям возможность ознакомиться с их содержимым в
работах/продуктах использующих libmdbx.
-----------------------------------------------------------------------
Далее в справочных целях приведены уведомления об авторских правах из
первоначально заимствованного кода.
---
Original source code was derived from LMDB in 2015,
and later evolutionarily rewritten in 2015-2024:
Copyright (c) 2011-2015 Howard Chu, Symas Corp. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted only as authorized by the OpenLDAP
@ -11,12 +142,17 @@ A copy of this license is available in the file LICENSE in the
top-level directory of the distribution or, alternatively, at
<http://www.OpenLDAP.org/license.html>.
OpenLDAP is a registered trademark of the OpenLDAP Foundation.
LMDB itself derived code from btree.c written by Martin Hedenfalk:
Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
Individual files and/or contributed packages may be copyright by
other parties and/or subject to additional restrictions.
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
This work also contains materials derived from public sources.
Additional information about OpenLDAP can be obtained at
<http://www.openldap.org/>.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

View File

@ -61,7 +61,7 @@ MDBX_BUILD_CXX ?= YES
CFLAGS ?= $(strip $(eval CFLAGS := -std=gnu11 -O2 -g -Wall -Werror -Wextra -Wpedantic -ffunction-sections -fPIC -fvisibility=hidden -pthread -Wno-error=attributes $$(shell for opt in -fno-semantic-interposition -Wno-unused-command-line-argument -Wno-tautological-compare; do [ -z "$$$$($(CC) '-DMDBX_BUILD_FLAGS="probe"' $$$${opt} -c $(SRC_PROBE_C) -o /dev/null >/dev/null 2>&1 || echo failed)" ] && echo "$$$${opt} "; done)$(CFLAGS_EXTRA))$(CFLAGS))
# choosing C++ standard with variable expansion trick (seems this work two times per session for GNU Make 3.81)
CXXSTD ?= $(eval CXXSTD := $$(shell for std in gnu++23 c++23 gnu++2b c++2b gnu++20 c++20 gnu++2a c++2a gnu++17 c++17 gnu++1z c++1z gnu++14 c++14 gnu++1y c++1y gnu+11 c++11 gnu++0x c++0x; do $(CXX) -std=$$$${std} -c $(SRC_PROBE_CXX) -o /dev/null 2>probe4std-$$$${std}.err >/dev/null && echo "-std=$$$${std}" && exit; done))$(CXXSTD)
CXXSTD ?= $(eval CXXSTD := $$(shell for std in gnu++23 c++23 gnu++2b c++2b gnu++20 c++20 gnu++2a c++2a gnu++17 c++17 gnu++1z c++1z gnu++14 c++14 gnu++1y c++1y gnu+11 c++11 gnu++0x c++0x; do $(CXX) -std=$$$${std} -DMDBX_BUILD_CXX=1 -c $(SRC_PROBE_CXX) -o /dev/null 2>probe4std-$$$${std}.err >/dev/null && echo "-std=$$$${std}" && exit; done))$(CXXSTD)
CXXFLAGS ?= $(strip $(CXXSTD) $(filter-out -std=gnu11,$(CFLAGS)))
# libraries and options for linking
@ -121,7 +121,8 @@ endef
SO_SUFFIX := $(shell $(uname2sosuffix))
HEADERS := mdbx.h mdbx.h++
LIBRARIES := libmdbx.a libmdbx.$(SO_SUFFIX)
TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk mdbx_drop
TOOLS := chk copy drop dump load stat
MDBX_TOOLS := $(addprefix mdbx_,$(TOOLS))
MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 mdbx_chk.1 mdbx_drop.1
TIP := // TIP:
@ -148,7 +149,7 @@ else
$(info $(TIP) Use `make V=1` for verbose.)
endif
all: show-options $(LIBRARIES) $(TOOLS)
all: show-options $(LIBRARIES) $(MDBX_TOOLS)
help:
@echo " make all - build libraries and tools"
@ -234,26 +235,26 @@ options:
ifeq ($(wildcard mdbx.c),mdbx.c)
#< dist-cutoff-end
@echo "## in README and source code (see mdbx.c) if you do."
@grep -h '#ifndef MDBX_' mdbx.c | grep -v BUILD | uniq | sed 's/#ifndef / /'
@grep -h '#ifndef MDBX_' mdbx.c | grep -v BUILD | sort -u | sed 's/#ifndef / /'
#> dist-cutoff-begin
else
@echo "## in README and source code (see src/options.h) if you do."
@grep -h '#ifndef MDBX_' src/internals.h src/options.h | grep -v BUILD | uniq | sed 's/#ifndef / /'
@grep -h '#ifndef MDBX_' src/*.h | grep -v BUILD | sort -u | sed 's/#ifndef / /'
endif
#< dist-cutoff-end
lib libs libmdbx mdbx: libmdbx.a libmdbx.$(SO_SUFFIX)
tools: $(TOOLS)
tools-static: $(addsuffix .static,$(TOOLS)) $(addsuffix .static-lto,$(TOOLS))
tools: $(MDBX_TOOLS)
tools-static: $(addsuffix .static,$(MDBX_TOOLS)) $(addsuffix .static-lto,$(MDBX_TOOLS))
strip: all
@echo ' STRIP libmdbx.$(SO_SUFFIX) $(TOOLS)'
$(TRACE )strip libmdbx.$(SO_SUFFIX) $(TOOLS)
@echo ' STRIP libmdbx.$(SO_SUFFIX) $(MDBX_TOOLS)'
$(TRACE )strip libmdbx.$(SO_SUFFIX) $(MDBX_TOOLS)
clean:
@echo ' REMOVE ...'
$(QUIET)rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *.$(SO_SUFFIX) *.dSYM *~ tmp.db/* \
$(QUIET)rm -rf $(MDBX_TOOLS) mdbx_test @* *.[ao] *.[ls]o *.$(SO_SUFFIX) *.dSYM *~ tmp.db/* \
*.gcov *.log *.err src/*.o test/*.o mdbx_example dist \
config.h src/config.h src/version.c *.tar* buildflags.tag \
mdbx_*.static mdbx_*.static-lto
@ -284,27 +285,28 @@ ifeq ($(wildcard mdbx.c),mdbx.c)
# Amalgamated source code, i.e. distributed after `make dist`
MAN_SRCDIR := man1/
config.h: buildflags.tag mdbx.c $(lastword $(MAKEFILE_LIST))
config.h: buildflags.tag mdbx.c $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' MAKE $@'
$(QUIET)(echo '#define MDBX_BUILD_TIMESTAMP "$(MDBX_BUILD_TIMESTAMP)"' \
&& echo "#define MDBX_BUILD_FLAGS \"$$(cat buildflags.tag)\"" \
&& echo '#define MDBX_BUILD_COMPILER "$(shell (LC_ALL=C $(CC) --version || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
&& echo '#define MDBX_BUILD_TARGET "$(shell set -o pipefail; (LC_ALL=C $(CC) -v 2>&1 | grep -i '^Target:' | cut -d ' ' -f 2- || (LC_ALL=C $(CC) --version | grep -qi e2k && echo E2K) || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
&& echo '#define MDBX_BUILD_CXX $(call select_by,MDBX_BUILD_CXX,1,0)' \
) >$@
mdbx-dylib.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST))
mdbx-dylib.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' CC $@'
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c mdbx.c -o $@
mdbx-static.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST))
mdbx-static.o: config.h mdbx.c mdbx.h $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' CC $@'
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c mdbx.c -o $@
mdbx++-dylib.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
mdbx++-dylib.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' CC $@'
$(QUIET)$(CXX) $(CXXFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c mdbx.c++ -o $@
mdbx++-static.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
mdbx++-static.o: config.h mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' CC $@'
$(QUIET)$(CXX) $(CXXFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c mdbx.c++ -o $@
@ -351,9 +353,9 @@ define uname2titer
esac
endef
DIST_EXTRA := LICENSE README.md CMakeLists.txt GNUmakefile Makefile ChangeLog.md VERSION.txt config.h.in ntdll.def \
DIST_EXTRA := LICENSE NOTICE README.md CMakeLists.txt GNUmakefile Makefile ChangeLog.md VERSION.txt config.h.in ntdll.def \
$(addprefix man1/, $(MANPAGES)) cmake/compiler.cmake cmake/profile.cmake cmake/utils.cmake
DIST_SRC := mdbx.h mdbx.h++ mdbx.c mdbx.c++ $(addsuffix .c, $(TOOLS))
DIST_SRC := mdbx.h mdbx.h++ mdbx.c mdbx.c++ $(addsuffix .c, $(MDBX_TOOLS))
TEST_DB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.db
TEST_LOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log
@ -362,20 +364,20 @@ TEST_ITER := $(shell $(uname2titer))
TEST_SRC := test/osal-$(TEST_OSAL).c++ $(filter-out $(wildcard test/osal-*.c++),$(wildcard test/*.c++)) $(call select_by,MDBX_BUILD_CXX,,src/mdbx.c++)
TEST_INC := $(wildcard test/*.h++)
TEST_OBJ := $(patsubst %.c++,%.o,$(TEST_SRC))
TAR ?= $(shell which gnu-tar || echo tar)
TAR ?= $(shell which gnu-tar 2>&- || echo tar)
ZIP ?= $(shell which zip || echo "echo 'Please install zip'")
CLANG_FORMAT ?= $(shell (which clang-format-14 || which clang-format-13 || which clang-format) 2>/dev/null)
CLANG_FORMAT ?= $(shell (which clang-format-19 || which clang-format) 2>/dev/null)
reformat:
@echo ' RUNNING clang-format...'
$(QUIET)if [ -n "$(CLANG_FORMAT)" ]; then \
git ls-files | grep -E '\.(c|c++|h|h++)(\.in)?$$' | xargs -r $(CLANG_FORMAT) -i --style=file; \
else \
echo "clang-format version 13..14 not found for 'reformat'"; \
echo "clang-format version 19 not found for 'reformat'"; \
fi
MAN_SRCDIR := src/man1/
ALLOY_DEPS := $(shell git ls-files src/)
ALLOY_DEPS := $(shell git ls-files src/ | grep -e /tools -e /man -v)
git_DIR := $(shell if [ -d .git ]; then echo .git; elif [ -s .git -a -f .git ]; then grep '^gitdir: ' .git | cut -d ':' -f 2; else echo git_directory_is_absent; fi)
MDBX_GIT_VERSION = $(shell set -o pipefail; git describe --tags '--match=v[0-9]*' 2>&- | sed -n 's|^v*\([0-9]\{1,\}\.[0-9]\{1,\}\.[0-9]\{1,\}\)\(.*\)|\1|p' || echo 'Please fetch tags and/or use non-obsolete git version')
MDBX_GIT_REVISION = $(shell set -o pipefail; git rev-list `git describe --tags --abbrev=0`..HEAD --count 2>&- || echo 'Please fetch tags and/or use non-obsolete git version')
@ -392,11 +394,11 @@ MDBX_SMOKE_EXTRA ?=
check: DESTDIR = $(shell pwd)/@check-install
check: test dist install
smoke-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1)
smoke-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1 -UNDEBUG -DMDBX_DEBUG=0)
smoke-assertion: smoke
test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1)
test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1 -UNDEBUG -DMDBX_DEBUG=0)
test-assertion: smoke
long-test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1)
long-test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1 -UNDEBUG -DMDBX_DEBUG=0)
long-test-assertion: smoke
smoke: build-test
@ -424,7 +426,7 @@ smoke-fault: build-test
test: build-test
@echo ' RUNNING `test/long_stochastic.sh --loops 2`...'
$(QUIET)test/long_stochastic.sh --dont-check-ram-size --loops 2 --db-upto-mb 256 --extra --skip-make --taillog >$(TEST_LOG) || (cat $(TEST_LOG) && false)
$(QUIET)test/long_stochastic.sh --dont-check-ram-size --loops 2 --db-upto-mb 256 --skip-make --taillog >$(TEST_LOG) || (cat $(TEST_LOG) && false)
long-test: test-long
test-long: build-test
@ -439,7 +441,7 @@ test-valgrind: test-memcheck
test-memcheck: CFLAGS_EXTRA=-Ofast -DENABLE_MEMCHECK
test-memcheck: build-test
@echo ' RUNNING `test/long_stochastic.sh --with-valgrind --loops 2`...'
$(QUIET)test/long_stochastic.sh --with-valgrind --extra --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false)
$(QUIET)test/long_stochastic.sh --with-valgrind --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false)
memcheck: smoke-memcheck
smoke-memcheck: VALGRIND=valgrind --trace-children=yes --log-file=valgrind-%p.log --leak-check=full --track-origins=yes --read-var-info=yes --error-exitcode=42 --suppressions=test/valgrind_suppress.txt
@ -480,23 +482,27 @@ build-test: all mdbx_example mdbx_test
define test-rule
$(patsubst %.c++,%.o,$(1)): $(1) $(TEST_INC) $(HEADERS) $(lastword $(MAKEFILE_LIST))
@echo ' CC $$@'
$(QUIET)$$(CXX) $$(CXXFLAGS) $$(MDBX_BUILD_OPTIONS) -c $(1) -o $$@
$(QUIET)$$(CXX) $$(CXXFLAGS) $$(MDBX_BUILD_OPTIONS) -DMDBX_BUILD_CXX=1 -DMDBX_WITHOUT_MSVC_CRT=0 -c $(1) -o $$@
endef
$(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file))))
mdbx_%: src/mdbx_%.c libmdbx.a
@echo ' CC+LD $@'
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' $^ $(EXE_LDFLAGS) $(LIBS) -o $@
define tool-rule
mdbx_$(1): src/tools/$(1).c libmdbx.a
@echo ' CC+LD $$@'
$(QUIET)$$(CC) $$(CFLAGS) $$(MDBX_BUILD_OPTIONS) -Isrc '-DMDBX_CONFIG_H="config.h"' $$^ $$(EXE_LDFLAGS) $$(LIBS) -o $$@
mdbx_%.static: src/mdbx_%.c mdbx-static.o
@echo ' CC+LD $@'
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' $^ $(EXE_LDFLAGS) $(LIBS) -static -Wl,--strip-all -o $@
mdbx_$(1).static: src/tools/$(1).c mdbx-static.o
@echo ' CC+LD $$@'
$(QUIET)$$(CC) $$(CFLAGS) $$(MDBX_BUILD_OPTIONS) -Isrc '-DMDBX_CONFIG_H="config.h"' $$^ $$(EXE_LDFLAGS) $$(LIBS) -static -Wl,--strip-all -o $$@
mdbx_%.static-lto: src/mdbx_%.c src/config.h src/version.c src/alloy.c $(ALLOY_DEPS)
@echo ' CC+LD $@'
$(QUIET)$(CC) $(CFLAGS) -Os -flto $(MDBX_BUILD_OPTIONS) '-DLIBMDBX_API=' '-DMDBX_CONFIG_H="config.h"' \
$< src/alloy.c $(EXE_LDFLAGS) $(LIBS) -static -Wl,--strip-all -o $@
mdbx_$(1).static-lto: src/tools/$(1).c src/config.h src/version.c src/alloy.c $(ALLOY_DEPS)
@echo ' CC+LD $$@'
$(QUIET)$$(CC) $$(CFLAGS) -Os -flto $$(MDBX_BUILD_OPTIONS) -Isrc '-DLIBMDBX_API=' '-DMDBX_CONFIG_H="config.h"' \
$$< src/alloy.c $$(EXE_LDFLAGS) $$(LIBS) -static -Wl,--strip-all -o $$@
endef
$(foreach file,$(TOOLS),$(eval $(call tool-rule,$(file))))
mdbx_test: $(TEST_OBJ) libmdbx.$(SO_SUFFIX)
@echo ' LD $@'
@ -506,16 +512,13 @@ $(git_DIR)/HEAD $(git_DIR)/index $(git_DIR)/refs/tags:
@echo '*** ' >&2
@echo '*** Please don''t use tarballs nor zips which are automatically provided by Github !' >&2
@echo '*** These archives do not contain version information and thus are unfit to build libmdbx.' >&2
@echo '*** You can vote for ability of disabling auto-creation such unsuitable archives at https://github.community/t/disable-tarball' >&2
@echo '*** ' >&2
@echo '*** Instead of above, just clone the git repository, either download a tarball or zip with the properly amalgamated source core.' >&2
@echo '*** For embedding libmdbx use a git-submodule or the amalgamated source code.' >&2
@echo '*** ' >&2
@echo '*** Please, avoid using any other techniques.' >&2
@echo '*** Instead just follow the https://libmdbx.dqdkfa.ru/usage.html' >&2
@echo '*** PLEASE, AVOID USING ANY OTHER TECHNIQUES.' >&2
@echo '*** ' >&2
@false
src/version.c: src/version.c.in $(lastword $(MAKEFILE_LIST)) $(git_DIR)/HEAD $(git_DIR)/index $(git_DIR)/refs/tags
src/version.c: src/version.c.in $(lastword $(MAKEFILE_LIST)) $(git_DIR)/HEAD $(git_DIR)/index $(git_DIR)/refs/tags LICENSE NOTICE
@echo ' MAKE $@'
$(QUIET)sed \
-e "s|@MDBX_GIT_TIMESTAMP@|$(MDBX_GIT_TIMESTAMP)|" \
@ -528,20 +531,21 @@ src/version.c: src/version.c.in $(lastword $(MAKEFILE_LIST)) $(git_DIR)/HEAD $(g
-e "s|\$${MDBX_VERSION_REVISION}|$(MDBX_GIT_REVISION)|" \
src/version.c.in >$@
src/config.h: buildflags.tag src/version.c $(lastword $(MAKEFILE_LIST))
src/config.h: buildflags.tag src/version.c $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' MAKE $@'
$(QUIET)(echo '#define MDBX_BUILD_TIMESTAMP "$(MDBX_BUILD_TIMESTAMP)"' \
&& echo "#define MDBX_BUILD_FLAGS \"$$(cat buildflags.tag)\"" \
&& echo '#define MDBX_BUILD_COMPILER "$(shell (LC_ALL=C $(CC) --version || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
&& echo '#define MDBX_BUILD_TARGET "$(shell set -o pipefail; (LC_ALL=C $(CC) -v 2>&1 | grep -i '^Target:' | cut -d ' ' -f 2- || (LC_ALL=C $(CC) --version | grep -qi e2k && echo E2K) || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
&& echo '#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)' \
&& echo '#define MDBX_BUILD_CXX $(call select_by,MDBX_BUILD_CXX,1,0)' \
) >$@
mdbx-dylib.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
mdbx-dylib.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' CC $@'
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c src/alloy.c -o $@
mdbx-static.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
mdbx-static.o: src/config.h src/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST)) LICENSE NOTICE
@echo ' CC $@'
$(QUIET)$(CC) $(CFLAGS) $(MDBX_BUILD_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c src/alloy.c -o $@
@ -570,9 +574,9 @@ docs/contrib.fame: src/version.c $(lastword $(MAKEFILE_LIST))
@echo ' MAKE $@'
$(QUIET)echo "" > $@ && git fame --show-email --format=md --silent-progress -w -M -C | grep '^|' >> $@
docs/overall.md: docs/__overview.md docs/_toc.md docs/__mithril.md docs/__history.md AUTHORS docs/contrib.fame LICENSE $(lastword $(MAKEFILE_LIST))
docs/overall.md: docs/__overview.md docs/_toc.md docs/__mithril.md docs/__history.md COPYRIGHT LICENSE NOTICE $(lastword $(MAKEFILE_LIST))
@echo ' MAKE $@'
$(QUIET)echo -e "\\mainpage Overall\n\\section brief Brief" | cat - $(filter %.md, $^) >$@ && echo -e "\n\n\nLicense\n=======\n" | cat AUTHORS docs/contrib.fame - LICENSE >>$@
$(QUIET)echo -e "\\mainpage Overall\n\\section brief Brief" | cat - $(filter %.md, $^) >$@ && echo -e "\n\n\nLicense\n=======\n" | cat - LICENSE >>$@
docs/intro.md: docs/_preface.md docs/__characteristics.md docs/__improvements.md docs/_restrictions.md docs/__performance.md
@echo ' MAKE $@'
@ -582,11 +586,11 @@ docs/usage.md: docs/__usage.md docs/_starting.md docs/__bindings.md
@echo ' MAKE $@'
$(QUIET)echo -e "\\page usage Usage\n\\section getting Building & Embedding" | cat - $^ | sed 's/^Bindings$$/Bindings {#bindings}/' >$@
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md AUTHORS LICENSE $(lastword $(MAKEFILE_LIST))
doxygen: docs/Doxyfile docs/overall.md docs/intro.md docs/usage.md mdbx.h mdbx.h++ src/options.h ChangeLog.md COPYRIGHT LICENSE NOTICE $(lastword $(MAKEFILE_LIST))
@echo ' RUNNING doxygen...'
$(QUIET)rm -rf docs/html && \
cat mdbx.h | tr '\n' '\r' | sed -e 's/LIBMDBX_INLINE_API\s*(\s*\([^,]\+\),\s*\([^,]\+\),\s*(\s*\([^)]\+\)\s*)\s*)\s*{/inline \1 \2(\3) {/g' | tr '\r' '\n' >docs/mdbx.h && \
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp AUTHORS LICENSE docs/html/
cp mdbx.h++ src/options.h ChangeLog.md docs/ && (cd docs && doxygen Doxyfile $(HUSH)) && cp COPYRIGHT LICENSE NOTICE docs/html/
mdbx++-dylib.o: src/config.h src/mdbx.c++ mdbx.h mdbx.h++ $(lastword $(MAKEFILE_LIST))
@echo ' CC $@'
@ -617,7 +621,7 @@ release-assets: libmdbx-amalgamated-$(MDBX_GIT_VERSION).zpaq \
dist-checked.tag: $(addprefix dist/, $(DIST_SRC) $(DIST_EXTRA))
@echo -n ' VERIFY amalgamated sources...'
$(QUIET)rm -rf $@ dist/@tmp-shared_internals.inc \
$(QUIET)rm -rf $@ dist/@tmp-essentials.inc dist/@tmp-internals.inc \
&& if grep -R "define xMDBX_ALLOY" dist | grep -q MDBX_BUILD_SOURCERY; then echo "sed output is WRONG!" >&2; exit 2; fi \
&& rm -rf dist-check && cp -r -p dist dist-check && ($(MAKE) IOARENA=false CXXSTD=$(CXXSTD) -C dist-check >dist-check.log 2>dist-check.err || (cat dist-check.err && exit 1)) \
&& touch $@ || (echo " FAILED! See dist-check.log and dist-check.err" >&2; exit 2) && echo " Ok"
@ -634,7 +638,6 @@ dist-checked.tag: $(addprefix dist/, $(DIST_SRC) $(DIST_EXTRA))
@echo ' CREATE $@'
$(QUIET)$(TAR) -c $(shell LC_ALL=C $(TAR) --help | grep -q -- '--owner' && echo '--owner=0 --group=0') -f - -C dist $(DIST_SRC) $(DIST_EXTRA) | bzip2 -9 -z >$@
%.zip: dist-checked.tag
@echo ' CREATE $@'
$(QUIET)rm -rf $@ && (cd dist && $(ZIP) -9 ../$@ $(DIST_SRC) $(DIST_EXTRA)) &>zip.log
@ -643,52 +646,81 @@ dist-checked.tag: $(addprefix dist/, $(DIST_SRC) $(DIST_EXTRA))
@echo ' CREATE $@'
$(QUIET)rm -rf $@ && (cd dist && zpaq a ../$@ $(DIST_SRC) $(DIST_EXTRA) -m59) &>zpaq.log
dist/mdbx.h: mdbx.h src/version.c $(lastword $(MAKEFILE_LIST))
@echo ' COPY $@'
$(QUIET)mkdir -p dist && cp $< $@
dist/mdbx.h++: mdbx.h++ src/version.c $(lastword $(MAKEFILE_LIST))
@echo ' COPY $@'
$(QUIET)mkdir -p dist && cp $< $@
dist/@tmp-shared_internals.inc: src/version.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
dist/@tmp-essentials.inc: src/version.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
@echo ' ALLOYING...'
$(QUIET)mkdir -p dist \
&& echo '#define xMDBX_ALLOY 1' >dist/@tmp-sed.inc && echo '#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)' >>dist/@tmp-sed.inc \
&& (grep -v '#include ' src/alloy.c && echo '#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)' \
&& sed \
-e '/#pragma once/r dist/@tmp-sed.inc' \
-e 's|#include "../mdbx.h"|@INCLUDE "mdbx.h"|' \
-e '/#include "base.h"/r src/base.h' \
-e '/#include "preface.h"/r src/preface.h' \
-e '/#include "osal.h"/r src/osal.h' \
-e '/#include "options.h"/r src/options.h' \
-e '/#include "atomics-types.h"/r src/atomics-types.h' \
-e '/#include "layout-dxb.h"/r src/layout-dxb.h' \
-e '/#include "layout-lck.h"/r src/layout-lck.h' \
-e '/#include "logging_and_debug.h"/r src/logging_and_debug.h' \
-e '/#include "utils.h"/r src/utils.h' \
-e '/#include "pnl.h"/r src/pnl.h' \
src/essentials.h \
| sed \
-e '/#pragma once/d' -e '/#include "/d' \
-e '/ clang-format o/d' -e '/ \*INDENT-O/d' \
src/internals.h >$@ \
&& rm -rf dist/@tmp-sed.inc
| grep -v '^/// ') >$@
dist/mdbx.c: dist/@tmp-shared_internals.inc $(lastword $(MAKEFILE_LIST))
dist/@tmp-internals.inc: dist/@tmp-essentials.inc src/version.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST))
$(QUIET)(cat dist/@tmp-essentials.inc \
&& sed \
-e '/#include "essentials.h"/d' \
-e '/#include "atomics-ops.h"/r src/atomics-ops.h' \
-e '/#include "proto.h"/r src/proto.h' \
-e '/#include "txl.h"/r src/txl.h' \
-e '/#include "unaligned.h"/r src/unaligned.h' \
-e '/#include "cogs.h"/r src/cogs.h' \
-e '/#include "cursor.h"/r src/cursor.h' \
-e '/#include "dbi.h"/r src/dbi.h' \
-e '/#include "dpl.h"/r src/dpl.h' \
-e '/#include "gc.h"/r src/gc.h' \
-e '/#include "lck.h"/r src/lck.h' \
-e '/#include "meta.h"/r src/meta.h' \
-e '/#include "node.h"/r src/node.h' \
-e '/#include "page-iov.h"/r src/page-iov.h' \
-e '/#include "page-ops.h"/r src/page-ops.h' \
-e '/#include "spill.h"/r src/spill.h' \
-e '/#include "sort.h"/r src/sort.h' \
-e '/#include "tls.h"/r src/tls.h' \
-e '/#include "walk.h"/r src/walk.h' \
-e '/#include "windows-import.h"/r src/windows-import.h' \
src/internals.h \
| sed \
-e '/#pragma once/d' -e '/#include "/d' \
-e '/ clang-format o/d' -e '/ \*INDENT-O/d' \
| grep -v '^/// ') >$@
dist/mdbx.c: dist/@tmp-internals.inc $(lastword $(MAKEFILE_LIST))
@echo ' MAKE $@'
$(QUIET)mkdir -p dist && (cat dist/@tmp-shared_internals.inc \
&& cat src/core.c src/osal.c src/version.c src/lck-windows.c src/lck-posix.c | sed \
$(QUIET)(cat dist/@tmp-internals.inc $(shell git ls-files src/*.c | grep -v alloy) src/version.c | sed \
-e '/#include "debug_begin.h"/r src/debug_begin.h' \
-e '/#include "debug_end.h"/r src/debug_end.h' \
) | sed -e '/#include "/d;/#pragma once/d' -e 's|@INCLUDE|#include|' \
-e '/ clang-format o/d;/ \*INDENT-O/d' >$@
dist/mdbx.c++: dist/@tmp-shared_internals.inc src/mdbx.c++ $(lastword $(MAKEFILE_LIST))
dist/mdbx.c++: dist/@tmp-essentials.inc src/mdbx.c++ $(lastword $(MAKEFILE_LIST))
@echo ' MAKE $@'
$(QUIET)mkdir -p dist && (cat dist/@tmp-shared_internals.inc && cat src/mdbx.c++) \
| sed -e '/#include "/d;/#pragma once/d' -e 's|@INCLUDE|#include|;s|"mdbx.h"|"mdbx.h++"|' \
$(QUIET)cat dist/@tmp-essentials.inc src/mdbx.c++ | sed \
-e '/#define xMDBX_ALLOY/d' \
-e '/#include "/d;/#pragma once/d' \
-e 's|@INCLUDE|#include|;s|"mdbx.h"|"mdbx.h++"|' \
-e '/ clang-format o/d;/ \*INDENT-O/d' >$@
define dist-tool-rule
dist/$(1).c: src/$(1).c src/wingetopt.h src/wingetopt.c \
dist/@tmp-shared_internals.inc $(lastword $(MAKEFILE_LIST))
dist/mdbx_$(1).c: src/tools/$(1).c src/tools/wingetopt.h src/tools/wingetopt.c \
dist/@tmp-internals.inc $(lastword $(MAKEFILE_LIST))
@echo ' MAKE $$@'
$(QUIET)mkdir -p dist && sed \
-e '/#include "internals.h"/r dist/@tmp-shared_internals.inc' \
-e '/#include "wingetopt.h"/r src/wingetopt.c' \
-e '/#include "essentials.h"/r dist/@tmp-essentials.inc' \
-e '/#include "wingetopt.h"/r src/tools/wingetopt.c' \
-e '/ clang-format o/d' -e '/ \*INDENT-O/d' \
src/$(1).c \
src/tools/$(1).c \
| sed -e '/#include "/d;/#pragma once/d;/#define xMDBX_ALLOY/d' -e 's|@INCLUDE|#include|' \
-e '/ clang-format o/d;/ \*INDENT-O/d' >$$@
@ -696,12 +728,12 @@ endef
$(foreach file,$(TOOLS),$(eval $(call dist-tool-rule,$(file))))
define dist-extra-rule
dist/$(1): $(1)
dist/$(1): $(1) src/version.c $(lastword $(MAKEFILE_LIST))
@echo ' REFINE $$@'
$(QUIET)mkdir -p $$(dir $$@) && sed -e '/^#> dist-cutoff-begin/,/^#< dist-cutoff-end/d' $$< >$$@
endef
$(foreach file,$(filter-out man1/% VERSION.txt %.in ntdll.def,$(DIST_EXTRA)),$(eval $(call dist-extra-rule,$(file))))
$(foreach file,mdbx.h mdbx.h++ $(filter-out man1/% VERSION.txt %.in ntdll.def,$(DIST_EXTRA)),$(eval $(call dist-extra-rule,$(file))))
dist/VERSION.txt: src/version.c
@echo ' MAKE $@'
@ -763,10 +795,10 @@ cross-qemu:
#< dist-cutoff-end
install: $(LIBRARIES) $(TOOLS) $(HEADERS)
install: $(LIBRARIES) $(MDBX_TOOLS) $(HEADERS)
@echo ' INSTALLING...'
$(QUIET)mkdir -p $(DESTDIR)$(prefix)/bin$(suffix) && \
$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(TOOLS) $(DESTDIR)$(prefix)/bin$(suffix)/ && \
$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(MDBX_TOOLS) $(DESTDIR)$(prefix)/bin$(suffix)/ && \
mkdir -p $(DESTDIR)$(prefix)/lib$(suffix)/ && \
$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(filter-out libmdbx.a,$(LIBRARIES)) $(DESTDIR)$(prefix)/lib$(suffix)/ && \
mkdir -p $(DESTDIR)$(prefix)/lib$(suffix)/ && \
@ -784,7 +816,7 @@ install-no-strip: install
uninstall:
@echo ' UNINSTALLING/REMOVE...'
$(QUIET)rm -f $(addprefix $(DESTDIR)$(prefix)/bin$(suffix)/,$(TOOLS)) \
$(QUIET)rm -f $(addprefix $(DESTDIR)$(prefix)/bin$(suffix)/,$(MDBX_TOOLS)) \
$(addprefix $(DESTDIR)$(prefix)/lib$(suffix)/,$(LIBRARIES)) \
$(addprefix $(DESTDIR)$(prefix)/include/,$(HEADERS)) \
$(addprefix $(DESTDIR)$(mandir)/man1/,$(MANPAGES))

206
LICENSE
View File

@ -1,47 +1,177 @@
The OpenLDAP Public License
Version 2.8, 17 August 2003
Redistribution and use of this software and associated documentation
("Software"), with or without modification, are permitted provided
that the following conditions are met:
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
1. Redistributions in source form must retain copyright statements
and notices,
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
2. Redistributions in binary form must reproduce applicable copyright
statements and notices, this list of conditions, and the following
disclaimer in the documentation and/or other materials provided
with the distribution, and
1. Definitions.
3. Redistributions must contain a verbatim copy of this document.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
The OpenLDAP Foundation may revise this license from time to time.
Each revision is distinguished by a version number. You may use
this Software under terms of this license revision or under the
terms of any subsequent revision of the license.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS
CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S)
OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
The names of the authors and copyright holders must not be used in
advertising or otherwise to promote the sale, use or other dealing
in this Software without specific, written prior permission. Title
to copyright in this Software shall at all times remain with copyright
holders.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
OpenLDAP is a registered trademark of the OpenLDAP Foundation.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
Copyright 1999-2003 The OpenLDAP Foundation, Redwood City,
California, USA. All Rights Reserved. Permission to copy and
distribute verbatim copies of this document is granted.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

23
NOTICE Normal file
View File

@ -0,0 +1,23 @@
libmdbx (aka MDBX) is an extremely fast, compact, powerful, embeddable,
transactional key-value storage engine with open-source code. MDBX has a
specific set of properties and capabilities, focused on creating unique
lightweight solutions.
Please visit https://libmdbx.dqdkfa.ru for more information, changelog,
documentation, C++ API description and links to the original git repo
with the source code. Questions, feedback and suggestions are welcome
to the Telegram group https://t.me/libmdbx.
Since 2017 _libmdbx_ development is funded by [Positive Technologies](https://www.ptsecurity.com)
and used inside company products. Всё будет хорошо!
Copyright 2015-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
SPDX-License-Identifier: Apache-2.0
For notes about the license change, credits and acknowledgments,
please refer to the COPYRIGHT file within original libmdbx source code
repository https://gitflic.ru/project/erthink/libmdbx
On 2022-04-15 the Github administration, without any warning or
explanation, deleted _libmdbx_ along with a lot of other projects,
simultaneously blocking access for many developers.
For the same reason ~~Github~~ is blacklisted forever.

View File

@ -1,36 +1,5 @@
<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences -->
### Время учить Русский
Начиная с 2021 года наблюдается устойчивые тенденции к распространению
недостоверной информации о _libmdbx_ в странах ~~НАТО~~,
политизированной критика, а также отказу от использования библиотеки в
пользу LMDB, несмотря на явные проблемы с одной стороны и преимущества с
другой. Поэтому начиная с 17 марта 2024 года прекращается
документирование и сопровождение проекта на английском языке. Новый
функционал будет документироваться только на русском языке, однако,
целенаправленного переписывания/перевода документации пока не
планируется.
### The origin has been migrated to [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
Since on 2022-04-15 the Github administration, without any warning
nor explanation, deleted _libmdbx_ along with a lot of other projects,
simultaneously blocking access for many developers.
For the same reason ~~Github~~ is blacklisted forever.
GitFlic's developers plan to support other languages,
including English 和 中文, in the near future.
### Основной репозиторий перемещен на [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
Так как 15 апреля 2022 администрация Github без предупреждения и
объяснения причин удалила _libmdbx_ вместе с массой других проектов,
одновременно заблокировав доступ многим разработчикам.
По этой же причине ~~Github~~ навсегда занесен в черный список.
--------------------------------------------------------------------------------
*The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.*
> Please refer to the online [documentation](https://libmdbx.dqdkfa.ru)
> with [`C` API description](https://libmdbx.dqdkfa.ru/group__c__api.html)
> and pay attention to the [`C++` API](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h%2B%2B#line-num-1).
@ -40,6 +9,8 @@ including English 和 中文, in the near future.
> For NEWS take a look to the [ChangeLog](https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md)
> or the [TODO](https://gitflic.ru/project/erthink/libmdbx/blob?file=TODO.md).
*The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.*
libmdbx
========
@ -48,7 +19,7 @@ libmdbx
_libmdbx_ is an extremely fast, compact, powerful, embedded, transactional
[key-value database](https://en.wikipedia.org/wiki/Key-value_database),
with [permissive license](https://gitflic.ru/project/erthink/libmdbx/blob?file=LICENSE).
with [Apache 2.0 license](https://gitflic.ru/project/erthink/libmdbx/blob?file=LICENSE).
_libmdbx_ has a specific set of properties and capabilities,
focused on creating unique lightweight solutions.
@ -144,15 +115,14 @@ $ objdump -f -h -j .text libmdbx.so
libmdbx.so: формат файла elf64-e2k
архитектура: elbrus-v6:64, флаги 0x00000150:
HAS_SYMS, DYNAMIC, D_PAGED
начальный адрес 0x0000000000021680
начальный адрес 0x00000000??????00
Разделы:
Idx Name Разм VMA LMA Фа смещ. Выр.
10 .text 000ddd28 0000000000021680 0000000000021680 00021680 2**3
CONTENTS, ALLOC, LOAD, READONLY, CODE
Idx Name Разм VMA LMA Фа смещ. Выр. Флаги
10 .text 000e7460 0000000000025c00 0000000000025c00 00025c00 2**10 CONTENTS, ALLOC, LOAD, READONLY, CODE
$ cc --version
lcc:1.26.12:Jun-05-2022:e2k-v6-linux
lcc:1.27.14:Jan-31-2024:e2k-v6-linux
gcc (GCC) 9.3.0 compatible
```
@ -276,7 +246,7 @@ out-of-the-box, not silently and catastrophically break down. The list
below is pruned down to the improvements most notable and obvious from
the user's point of view.
## Added Features
## Some Added Features
1. Keys could be more than 2 times longer than _LMDB_.
> For DB with default page size _libmdbx_ support keys up to 2022 bytes
@ -319,8 +289,7 @@ be found between a `KEY1` and a `KEY2`. This is a prerequisite for build
and/or optimize query execution plans.
> _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to corresponding keys.
8. `mdbx_chk` utility for database integrity check.
Since version 0.9.1, the utility supports checking the database using any of the three meta pages and the ability to switch to it.
8. Database integrity check API both with standalone `mdbx_chk` utility.
9. Support for opening databases in the exclusive mode, including on a network share.
@ -410,12 +379,26 @@ The origin for now is at [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
with backup at [ABF by ROSA Лаб](https://abf.rosalinux.ru/erthink/libmdbx).
For the same reason ~~Github~~ is blacklisted forever.
Начиная с 2021 года наблюдаются устойчивые тенденции к распространению
недостоверной информации о libmdbx в странах НАТО, политизированной
критики, а также отказу от использования библиотеки в пользу LMDB,
несмотря на явные проблемы с одной стороны и преимущества с другой.
Поэтому, начиная с 17 марта 2024 года, прекращается документирование и
сопровождение проекта на английском языке. Новая функциональность будет
документироваться только на русском языке, однако, целенаправленного
переписывания/перевода документации пока не планируется.
Since May 2024 and version v0.13 _libmdbx_ was re-licensed under Apache-2.0 license.
Please refer to the `COPYRIGHT` file for license change explanations.
## Acknowledgments
Howard Chu <hyc@openldap.org> is the author of LMDB, from which
originated the _libmdbx_ in 2015.
Howard Chu <hyc@openldap.org> and Hallvard Furuseth
<hallvard@openldap.org> are the authors of _LMDB_, from which _libmdbx_
was forked in 2015.
Martin Hedenfalk <martin@bzero.se> is the author of `btree.c` code, which
was used to begin development of LMDB.
was used to begin development of _LMDB_.
<!-- section-end -->
@ -523,8 +506,10 @@ There are no special traits nor quirks if you use libmdbx ONLY inside the single
But in a cross-container cases or with a host-container(s) mix the two major things MUST be
guaranteed:
1. Coherence of memory mapping content and unified page cache inside OS kernel for host and all container(s) operated with a DB.
Basically this means must be only a single physical copy of each memory mapped DB' page in the system memory.
1. Coherence of memory mapping content and unified page cache inside OS
kernel for host and all container(s) operated with a DB. Basically this
means there must be only a single physical copy of each memory-mapped DB
page in the system memory.
2. Uniqueness of [PID](https://en.wikipedia.org/wiki/Process_identifier) values and/or a common space for ones:
- for POSIX systems: PID uniqueness for all processes operated with a DB.

View File

@ -1,17 +1,5 @@
## Copyright (c) 2012-2024 Leonid Yuriev <leo@yuriev.ru>.
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
## Copyright (c) 2010-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
## SPDX-License-Identifier: Apache-2.0
if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)

View File

@ -1,17 +1,5 @@
## Copyright (c) 2012-2024 Leonid Yuriev <leo@yuriev.ru>.
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
## Copyright (c) 2012-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
## SPDX-License-Identifier: Apache-2.0
if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)

View File

@ -1,17 +1,5 @@
## Copyright (c) 2012-2024 Leonid Yuriev <leo@yuriev.ru>.
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
## Copyright (c) 2012-2024 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
## SPDX-License-Identifier: Apache-2.0
if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)

313
mdbx.h
View File

@ -1,11 +1,10 @@
/**
_libmdbx_ is an extremely fast, compact, powerful, embedded,
_libmdbx_ (aka MDBX) is an extremely fast, compact, powerful, embeddable,
transactional [key-value
store](https://en.wikipedia.org/wiki/Key-value_database) database, with
[permissive license](./LICENSE). _MDBX_ has a specific set of properties and
capabilities, focused on creating unique lightweight solutions with
extraordinary performance.
store](https://en.wikipedia.org/wiki/Key-value_database), with [Apache 2.0
license](./LICENSE). _MDBX_ has a specific set of properties and capabilities,
focused on creating unique lightweight solutions with extraordinary performance.
_libmdbx_ is superior to [LMDB](https://bit.ly/26ts7tL) in terms of features
and reliability, not inferior in performance. In comparison to LMDB, _libmdbx_
@ -14,60 +13,24 @@ break down. _libmdbx_ supports Linux, Windows, MacOS, OSX, iOS, Android,
FreeBSD, DragonFly, Solaris, OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other
systems compliant with POSIX.1-2008.
The origin has been migrated to
[GitFlic](https://gitflic.ru/project/erthink/libmdbx) since on 2022-04-15
the Github administration, without any warning nor explanation, deleted libmdbx
along with a lot of other projects, simultaneously blocking access for many
developers. For the same reason ~~Github~~ is blacklisted forever.
Please visit https://libmdbx.dqdkfa.ru for more information, documentation,
C++ API description and links to the origin git repo with the source code.
Questions, feedback and suggestions are welcome in the Telegram group
https://t.me/libmdbx.
_The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо._
\note The origin has been migrated to
[GitFlic](https://gitflic.ru/project/erthink/libmdbx) since on 2022-04-15 the
Github administration, without any warning nor explanation, deleted libmdbx
along with a lot of other projects, simultaneously blocking access for many
developers. For the same reason ~~Github~~ is blacklisted forever.
\section copyright LICENSE & COPYRIGHT
\authors Copyright (c) 2015-2024, Leonid Yuriev <leo@yuriev.ru>
and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file.
\copyright Redistribution and use in source and binary forms, with or without
modification, are permitted only as authorized by the OpenLDAP Public License.
A copy of this license is available in the file LICENSE in the
top-level directory of the distribution or, alternatively, at
<http://www.OpenLDAP.org/license.html>.
---
This code is derived from "LMDB engine" written by
Howard Chu (Symas Corporation), which itself derived from btree.c
written by Martin Hedenfalk.
---
Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted only as authorized by the OpenLDAP
Public License.
A copy of this license is available in the file LICENSE in the
top-level directory of the distribution or, alternatively, at
<http://www.OpenLDAP.org/license.html>.
---
Portions Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
\copyright SPDX-License-Identifier: Apache-2.0
\note Please refer to the COPYRIGHT file for explanations license change,
credits and acknowledgments.
\author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
*******************************************************************************/
@ -98,7 +61,7 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
/* clang-format off */
/**
\file mdbx.h
\brief The libmdbx C API header file
\brief The libmdbx C API header file.
\defgroup c_api C API
@{
@ -359,6 +322,14 @@ typedef mode_t mdbx_mode_t;
#endif
#endif /* MDBX_DEPRECATED */
#ifndef MDBX_DEPRECATED_ENUM
#if !defined(DOXYGEN) && (!defined(_MSC_VER) || _MSC_VER >= 1930)
#define MDBX_DEPRECATED_ENUM MDBX_DEPRECATED
#else
#define MDBX_DEPRECATED_ENUM /* avoid madness MSVC */
#endif
#endif /* MDBX_DEPRECATED_ENUM */
#ifndef __dll_export
#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) || \
defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
@ -393,7 +364,8 @@ typedef mode_t mdbx_mode_t;
/** \brief Auxiliary macro for robustly define the both inline version of API
* function and non-inline fallback dll-exported version for applications linked
* with old version of libmdbx, with a strictly ODR-common implementation. */
* with old version of libmdbx, with a strictly ODR-common implementation. Thus,
* we emulate __extern_inline for all compilers, including non-GNU ones. */
#if defined(LIBMDBX_INTERNALS) && !defined(LIBMDBX_NO_EXPORTS_LEGACY_API)
#define LIBMDBX_INLINE_API(TYPE, NAME, ARGS) \
/* proto of exported which uses common impl */ LIBMDBX_API TYPE NAME ARGS; \
@ -888,7 +860,7 @@ enum MDBX_constants {
/** Log level
* \note Levels detailed than (great than) \ref MDBX_LOG_NOTICE
* requires build libmdbx with \ref MDBX_DEBUG option. */
enum MDBX_log_level_t {
typedef enum MDBX_log_level {
/** Critical conditions, i.e. assertion failures.
* \note libmdbx always produces such messages regardless
* of \ref MDBX_DEBUG build option. */
@ -938,17 +910,14 @@ enum MDBX_log_level_t {
/** for \ref mdbx_setup_debug() only: Don't change current settings */
MDBX_LOG_DONTCHANGE = -1
};
#ifndef __cplusplus
typedef enum MDBX_log_level_t MDBX_log_level_t;
#endif
} MDBX_log_level_t;
/** \brief Runtime debug flags
*
* \details `MDBX_DBG_DUMP` and `MDBX_DBG_LEGACY_MULTIOPEN` always have an
* effect, but `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` and `MDBX_DBG_JITTER` only if
* libmdbx built with \ref MDBX_DEBUG. */
enum MDBX_debug_flags_t {
typedef enum MDBX_debug_flags {
MDBX_DBG_NONE = 0,
/** Enable assertion checks.
@ -986,12 +955,8 @@ enum MDBX_debug_flags_t {
/** for mdbx_setup_debug() only: Don't change current settings */
MDBX_DBG_DONTCHANGE = -1
};
#ifndef __cplusplus
typedef enum MDBX_debug_flags_t MDBX_debug_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t)
#endif
} MDBX_debug_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags)
/** \brief A debug-logger callback function,
* called before printing the message and aborting.
@ -1086,7 +1051,7 @@ MDBX_NORETURN LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env,
* \ingroup c_opening
* \anchor env_flags
* \see mdbx_env_open() \see mdbx_env_set_flags() */
enum MDBX_env_flags_t {
typedef enum MDBX_env_flags {
MDBX_ENV_DEFAULTS = 0,
/** Extra validation of DB structure and pages content.
@ -1210,7 +1175,7 @@ enum MDBX_env_flags_t {
/** Отвязывает транзакции от потоков/threads насколько это возможно.
*
* Эта опция предназначена для приложений, которые мультиплексируют множество
* Опция предназначена для приложений, которые мультиплексируют множество
* пользовательских легковесных потоков выполнения по отдельным потокам
* операционной системы, например как это происходит в средах выполнения
* GoLang и Rust. Таким приложениям также рекомендуется сериализовать
@ -1278,10 +1243,9 @@ enum MDBX_env_flags_t {
* Этот флаг вступает в силу при открытии среды и не может быть изменен после.
*/
MDBX_NOSTICKYTHREADS = UINT32_C(0x200000),
#ifndef _MSC_VER /* avoid madness MSVC */
/** \deprecated Please use \ref MDBX_NOSTICKYTHREADS instead. */
MDBX_NOTLS MDBX_DEPRECATED = MDBX_NOSTICKYTHREADS,
#endif /* avoid madness MSVC */
MDBX_NOTLS MDBX_DEPRECATED_ENUM = MDBX_NOSTICKYTHREADS,
/** Don't do readahead.
*
@ -1327,7 +1291,6 @@ enum MDBX_env_flags_t {
* This flag may be changed at any time using `mdbx_env_set_flags()`. */
MDBX_NOMEMINIT = UINT32_C(0x1000000),
#ifndef _MSC_VER /* avoid madness MSVC */
/** Aims to coalesce a Garbage Collection items.
* \deprecated Always enabled since v0.12 and deprecated since v0.13.
*
@ -1339,8 +1302,7 @@ enum MDBX_env_flags_t {
* Unallocated space and reducing the database file.
*
* This flag may be changed at any time using mdbx_env_set_flags(). */
MDBX_COALESCE MDBX_DEPRECATED = UINT32_C(0x2000000),
#endif /* avoid madness MSVC */
MDBX_COALESCE MDBX_DEPRECATED_ENUM = UINT32_C(0x2000000),
/** LIFO policy for recycling a Garbage Collection items.
*
@ -1543,19 +1505,14 @@ enum MDBX_env_flags_t {
MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | UINT32_C(0x100000),
/** end of sync_modes @} */
};
#ifndef __cplusplus
/** \ingroup c_opening */
typedef enum MDBX_env_flags_t MDBX_env_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_env_flags_t)
#endif
} MDBX_env_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_env_flags)
/** Transaction flags
* \ingroup c_transactions
* \anchor txn_flags
* \see mdbx_txn_begin() \see mdbx_txn_flags() */
enum MDBX_txn_flags_t {
typedef enum MDBX_txn_flags {
/** Start read-write transaction.
*
* Only one write transaction may be active at a time. Writes are fully
@ -1627,18 +1584,14 @@ enum MDBX_txn_flags_t {
* \note Transaction state flag. Returned from \ref mdbx_txn_flags()
* but can't be used with \ref mdbx_txn_begin(). */
MDBX_TXN_BLOCKED = MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD
};
#ifndef __cplusplus
typedef enum MDBX_txn_flags_t MDBX_txn_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_txn_flags_t)
#endif
} MDBX_txn_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_txn_flags)
/** \brief Database flags
* \ingroup c_dbi
* \anchor db_flags
* \see mdbx_dbi_open() */
enum MDBX_db_flags_t {
typedef enum MDBX_db_flags {
/** Variable length unique keys with usual byte-by-byte string comparison. */
MDBX_DB_DEFAULTS = 0,
@ -1681,19 +1634,14 @@ enum MDBX_db_flags_t {
* sub-database will be opened with flags which it was created, and then an
* application could determine the actual flags by \ref mdbx_dbi_flags(). */
MDBX_DB_ACCEDE = MDBX_ACCEDE
};
#ifndef __cplusplus
/** \ingroup c_dbi */
typedef enum MDBX_db_flags_t MDBX_db_flags_t;
#else
} MDBX_db_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_db_flags_t)
#endif
/** \brief Data changing flags
* \ingroup c_crud
* \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
* \see mdbx_put() \see mdbx_cursor_put() \see mdbx_replace() */
enum MDBX_put_flags_t {
typedef enum MDBX_put_flags {
/** Upsertion by default (without any other flags) */
MDBX_UPSERT = 0,
@ -1731,18 +1679,13 @@ enum MDBX_put_flags_t {
/** Only for \ref MDBX_DUPFIXED.
* Store multiple data items in one call. */
MDBX_MULTIPLE = UINT32_C(0x80000)
};
#ifndef __cplusplus
/** \ingroup c_crud */
typedef enum MDBX_put_flags_t MDBX_put_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_put_flags_t)
#endif
} MDBX_put_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_put_flags)
/** \brief Environment copy flags
* \ingroup c_extra
* \see mdbx_env_copy() \see mdbx_env_copy2fd() */
enum MDBX_copy_flags_t {
typedef enum MDBX_copy_flags {
MDBX_CP_DEFAULTS = 0,
/** Copy with compactification: Omit free space from copy and renumber all
@ -1751,19 +1694,14 @@ enum MDBX_copy_flags_t {
/** Force to make resizable copy, i.e. dynamic size instead of fixed */
MDBX_CP_FORCE_DYNAMIC_SIZE = 2u
};
#ifndef __cplusplus
/** \ingroup c_extra */
typedef enum MDBX_copy_flags_t MDBX_copy_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_copy_flags_t)
#endif
} MDBX_copy_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_copy_flags)
/** \brief Cursor operations
* \ingroup c_cursors
* This is the set of all operations for retrieving data using a cursor.
* \see mdbx_cursor_get() */
enum MDBX_cursor_op {
typedef enum MDBX_cursor_op {
/** Position at first key/data item */
MDBX_FIRST,
@ -1875,18 +1813,14 @@ enum MDBX_cursor_op {
MDBX_TO_PAIR_EQUAL,
MDBX_TO_PAIR_GREATER_OR_EQUAL,
MDBX_TO_PAIR_GREATER_THAN
};
#ifndef __cplusplus
/** \ingroup c_cursors */
typedef enum MDBX_cursor_op MDBX_cursor_op;
#endif
} MDBX_cursor_op;
/** \brief Errors and return codes
* \ingroup c_err
*
* BerkeleyDB uses -30800 to -30999, we'll go under them
* \see mdbx_strerror() \see mdbx_strerror_r() \see mdbx_liberr2str() */
enum MDBX_error_t {
typedef enum MDBX_error {
/** Successful result */
MDBX_SUCCESS = 0,
@ -2062,11 +1996,7 @@ enum MDBX_error_t {
MDBX_EREMOTE = ENOTBLK,
MDBX_EDEADLK = EDEADLK
#endif /* !Windows */
};
#ifndef __cplusplus
/** \ingroup c_err */
typedef enum MDBX_error_t MDBX_error_t;
#endif
} MDBX_error_t;
/** MDBX_MAP_RESIZED
* \ingroup c_err
@ -2158,7 +2088,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv);
/** \brief MDBX environment extra runtime options.
* \ingroup c_settings
* \see mdbx_env_set_option() \see mdbx_env_get_option() */
enum MDBX_option_t {
typedef enum MDBX_option {
/** \brief Controls the maximum number of named databases for the environment.
*
* \details By default only unnamed key-value database could used and
@ -2323,10 +2253,11 @@ enum MDBX_option_t {
* \details This option controls the in-process threshold of minimum page
* fill, as used space of percentage of a page. Neighbour pages emptier than
* this value are candidates for merging. The threshold value is specified
* in 1/65536 of percent, which is equivalent to the 16-dot-16 fixed point
* format. The specified value must be in the range from 12.5% (almost empty)
* to 50% (half empty) which corresponds to the range from 8192 and to 32768
* in units respectively.
* in 1/65536 points of a whole page, which is equivalent to the 16-dot-16
* fixed point format.
* The specified value must be in the range from 12.5% (almost empty page)
* to 50% (half empty page) which corresponds to the range from 8192 and
* to 32768 in units respectively.
* \see MDBX_opt_prefer_waf_insteadof_balance */
MDBX_opt_merge_threshold_16dot16_percent,
@ -2414,11 +2345,7 @@ enum MDBX_option_t {
*
* \see MDBX_opt_merge_threshold_16dot16_percent */
MDBX_opt_prefer_waf_insteadof_balance
};
#ifndef __cplusplus
/** \ingroup c_settings */
typedef enum MDBX_option_t MDBX_option_t;
#endif
} MDBX_option_t;
/** \brief Sets the value of a extra runtime options for an environment.
* \ingroup c_settings
@ -2533,7 +2460,7 @@ LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
/** \brief Deletion modes for \ref mdbx_env_delete().
* \ingroup c_extra
* \see mdbx_env_delete() */
enum MDBX_env_delete_mode_t {
typedef enum MDBX_env_delete_mode {
/** \brief Just delete the environment's files and directory if any.
* \note On POSIX systems, processes already working with the database will
* continue to work without interference until it close the environment.
@ -2547,11 +2474,7 @@ enum MDBX_env_delete_mode_t {
/** \brief Wait until other processes closes the environment before deletion.
*/
MDBX_ENV_WAIT_FOR_UNUSED = 2,
};
#ifndef __cplusplus
/** \ingroup c_extra */
typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t;
#endif
} MDBX_env_delete_mode_t;
/** \brief Delete the environment's files in a proper and multiprocess-safe way.
* \ingroup c_extra
@ -2662,7 +2585,7 @@ struct MDBX_stat {
uint32_t ms_depth; /**< Depth (height) of the B-tree */
uint64_t ms_branch_pages; /**< Number of internal (non-leaf) pages */
uint64_t ms_leaf_pages; /**< Number of leaf pages */
uint64_t ms_overflow_pages; /**< Number of overflow pages */
uint64_t ms_overflow_pages; /**< Number of large/overflow pages */
uint64_t ms_entries; /**< Number of data items */
uint64_t ms_mod_txnid; /**< Transaction ID of committed last modification */
};
@ -3122,7 +3045,7 @@ LIBMDBX_API int mdbx_env_resurrect_after_fork(MDBX_env *env);
* \ingroup c_settings
* \anchor warmup_flags
* \see mdbx_env_warmup() */
enum MDBX_warmup_flags_t {
typedef enum MDBX_warmup_flags {
/** By default \ref mdbx_env_warmup() just ask OS kernel to asynchronously
* prefetch database pages. */
MDBX_warmup_default = 0,
@ -3165,12 +3088,8 @@ enum MDBX_warmup_flags_t {
/** Release the lock that was performed before by \ref MDBX_warmup_lock. */
MDBX_warmup_release = 16,
};
#ifndef __cplusplus
typedef enum MDBX_warmup_flags_t MDBX_warmup_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t)
#endif
} MDBX_warmup_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags)
/** \brief Warms up the database by loading pages into memory, optionally lock
* ones. \ingroup c_settings
@ -3564,7 +3483,7 @@ MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags);
/** \brief Returns maximal data size in bytes to fit in a leaf-page or
* single overflow/large-page with the given page size and database flags,
* single large/overflow-page with the given page size and database flags,
* or -1 if pagesize is invalid.
* \ingroup c_statinfo
* \see db_flags */
@ -3740,7 +3659,7 @@ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
mdbx_env_get_pairsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags);
/** \brief Returns maximal data size in bytes to fit in a leaf-page or
* single overflow/large-page for specified database flags.
* single large/overflow-page for specified database flags.
* \ingroup c_statinfo
*
* \param [in] env An environment handle returned by \ref mdbx_env_create().
@ -4578,7 +4497,7 @@ LIBMDBX_API int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi,
/** \brief DBI state bits returned by \ref mdbx_dbi_flags_ex()
* \ingroup c_statinfo
* \see mdbx_dbi_flags_ex() */
enum MDBX_dbi_state_t {
typedef enum MDBX_dbi_state {
/** DB was written in this txn */
MDBX_DBI_DIRTY = 0x01,
/** Cached Named-DB record is older than txnID */
@ -4587,13 +4506,8 @@ enum MDBX_dbi_state_t {
MDBX_DBI_FRESH = 0x04,
/** Named-DB handle created in this txn */
MDBX_DBI_CREAT = 0x08,
};
#ifndef __cplusplus
/** \ingroup c_statinfo */
typedef enum MDBX_dbi_state_t MDBX_dbi_state_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_dbi_state_t)
#endif
} MDBX_dbi_state_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_dbi_state)
/** \brief Retrieve the DB flags and status for a database handle.
* \ingroup c_statinfo
@ -5005,6 +4919,7 @@ LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor,
* \see mdbx_cursor_renew()
* \see mdbx_cursor_bind()
* \see mdbx_cursor_close()
* \see mdbx_cursor_reset()
*
 * \note In contrast to LMDB, MDBX requires that any opened cursor can be
 * reused and must be freed explicitly, regardless of whether it was opened in a
@ -5017,6 +4932,20 @@ LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor,
* \returns A non-zero error value on failure and 0 on success. */
LIBMDBX_API int mdbx_cursor_unbind(MDBX_cursor *cursor);
/** \brief Сбрасывает состояние курсора.
* \ingroup c_cursors
*
* В результате сброса курсор становится неустановленным и не позволяет
* выполнять операции относительного позиционирования, получения или изменения
* данных, до установки на позицию не зависящую от текущей. Что позволяет
* приложению пресекать дальнейшие операции без предварительного
* позиционирования курсора.
*
* \param [in] cursor Указатель на курсор.
*
* \returns Результат операции сканирования, либо код ошибки. */
LIBMDBX_API int mdbx_cursor_reset(MDBX_cursor *cursor);
/** \brief Create a cursor handle for the specified transaction and DBI handle.
* \ingroup c_cursors
*
@ -5197,6 +5126,21 @@ LIBMDBX_API int mdbx_cursor_compare(const MDBX_cursor *left,
LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key,
MDBX_val *data, MDBX_cursor_op op);
/** \brief Служебная функция для использования в утилитах.
* \ingroup c_extra
*
* При использовании определяемых пользователем функций сравнения (aka custom
* comparison functions) проверка порядка ключей может приводить к неверным
* результатам и возврате ошибки \ref MDBX_CORRUPTED.
*
* Эта функция отключает контроль порядка следования ключей на страницах при
* чтении страниц БД для этого курсора, и таким образом, позволяет прочитать
* данные при отсутствии/недоступности использованных функций сравнения.
* \see avoid_custom_comparators
*
* \returns Результат операции сканирования, либо код ошибки. */
LIBMDBX_API int mdbx_cursor_ignord(MDBX_cursor *cursor);
/** \brief Тип предикативных функций обратного вызова используемых
* \ref mdbx_cursor_scan() и \ref mdbx_cursor_scan_from() для пробирования
* пар ключ-значения.
@ -5424,18 +5368,16 @@ LIBMDBX_API int mdbx_cursor_scan_from(MDBX_cursor *cursor,
 * \param [in] limit The capacity of the pairs buffer as a number of items,
 * not a number of pairs.
* \param [in] op A cursor operation \ref MDBX_cursor_op (only
* \ref MDBX_FIRST, \ref MDBX_NEXT, \ref MDBX_GET_CURRENT
* are supported).
* \ref MDBX_FIRST and \ref MDBX_NEXT are supported).
*
* \returns A non-zero error value on failure and 0 on success,
* some possible errors are:
* \retval MDBX_THREAD_MISMATCH Given transaction is not owned
* by current thread.
* \retval MDBX_NOTFOUND No more key-value pairs are available.
 * \retval MDBX_NOTFOUND No key-value pairs are available.
* \retval MDBX_ENODATA The cursor is already at the end of data.
* \retval MDBX_RESULT_TRUE The specified limit is less than the available
* key-value pairs on the current page/position
* that the cursor points to.
* \retval MDBX_RESULT_TRUE The returned chunk is the last one,
* and there are no pairs left.
* \retval MDBX_EINVAL An invalid parameter was specified. */
LIBMDBX_API int mdbx_cursor_get_batch(MDBX_cursor *cursor, size_t *count,
MDBX_val *pairs, size_t limit,
@ -6166,7 +6108,7 @@ LIBMDBX_API int mdbx_preopen_snapinfoW(const wchar_t *pathname,
* \note Данный API еще не зафиксирован, в последующих версиях могут быть
* незначительные доработки и изменения.
* \see mdbx_env_chk() */
enum MDBX_chk_flags_t {
typedef enum MDBX_chk_flags {
/** Режим проверки по-умолчанию, в том числе в режиме только-чтения. */
MDBX_CHK_DEFAULTS = 0,
@ -6184,18 +6126,13 @@ enum MDBX_chk_flags_t {
* \note Требуется при проверке унаследованных БД созданных с использованием
* нестандартных (пользовательских) функций сравнения ключей или значений. */
MDBX_CHK_IGNORE_ORDER = 8
};
#ifndef __cplusplus
/** \ingroup c_opening */
typedef enum MDBX_chk_flags_t MDBX_chk_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags_t)
#endif
} MDBX_chk_flags_t;
DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags)
/** \brief Уровни логирование/детализации информации,
* поставляемой через обратные вызовы при проверке целостности базы данных.
* \see mdbx_env_chk() */
enum MDBX_chk_severity {
typedef enum MDBX_chk_severity {
MDBX_chk_severity_prio_shift = 4,
MDBX_chk_severity_kind_mask = 0xF,
MDBX_chk_fatal = 0x00u,
@ -6209,25 +6146,25 @@ enum MDBX_chk_severity {
MDBX_chk_verbose = 0x78u,
MDBX_chk_details = 0x89u,
MDBX_chk_extra = 0x9Au
};
} MDBX_chk_severity_t;
/** \brief Стадии проверки,
* сообщаемые через обратные вызовы при проверке целостности базы данных.
* \see mdbx_env_chk() */
enum MDBX_chk_stage {
typedef enum MDBX_chk_stage {
MDBX_chk_none,
MDBX_chk_init,
MDBX_chk_lock,
MDBX_chk_meta,
MDBX_chk_traversal_tree,
MDBX_chk_traversal_freedb,
MDBX_chk_tree,
MDBX_chk_gc,
MDBX_chk_space,
MDBX_chk_traversal_maindb,
MDBX_chk_traversal_subdbs,
MDBX_chk_maindb,
MDBX_chk_subdbs,
MDBX_chk_conclude,
MDBX_chk_unlock,
MDBX_chk_finalize
};
} MDBX_chk_stage_t;
/** \brief Виртуальная строка отчета, формируемого при проверке целостности базы
* данных. \see mdbx_env_chk() */
@ -6251,8 +6188,8 @@ typedef struct MDBX_chk_scope {
MDBX_chk_issue_t *issues;
struct MDBX_chk_internal *internal;
const void *object;
enum MDBX_chk_stage stage;
enum MDBX_chk_severity verbosity;
MDBX_chk_stage_t stage;
MDBX_chk_severity_t verbosity;
size_t subtotal_issues;
union {
void *ptr;
@ -6373,11 +6310,11 @@ typedef struct MDBX_chk_callbacks {
size_t entry_number, const MDBX_val *key,
const MDBX_val *value);
int (*stage_begin)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage);
int (*stage_end)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage, int err);
int (*stage_begin)(MDBX_chk_context_t *ctx, MDBX_chk_stage_t);
int (*stage_end)(MDBX_chk_context_t *ctx, MDBX_chk_stage_t, int err);
MDBX_chk_line_t *(*print_begin)(MDBX_chk_context_t *ctx,
enum MDBX_chk_severity severity);
MDBX_chk_severity_t severity);
void (*print_flush)(MDBX_chk_line_t *);
void (*print_done)(MDBX_chk_line_t *);
void (*print_chars)(MDBX_chk_line_t *, const char *str, size_t len);
@ -6417,8 +6354,8 @@ typedef struct MDBX_chk_callbacks {
* \returns Нулевое значение в случае успеха, иначе код ошибки. */
LIBMDBX_API int mdbx_env_chk(MDBX_env *env, const MDBX_chk_callbacks_t *cb,
MDBX_chk_context_t *ctx,
const enum MDBX_chk_flags_t flags,
enum MDBX_chk_severity verbosity,
const MDBX_chk_flags_t flags,
MDBX_chk_severity_t verbosity,
unsigned timeout_seconds_16dot16);
/** \brief Вспомогательная функция для подсчета проблем детектируемых

View File

@ -1,8 +1,8 @@
/// \file mdbx.h++
/// \brief The libmdbx C++ API header file.
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2020-2024
///
/// \author Copyright (c) 2020-2024, Leonid Yuriev <leo@yuriev.ru>.
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \file mdbx.h++
/// \brief The libmdbx C++ API header file.
///
/// Tested with:
/// - Elbrus LCC >= 1.23 (http://www.mcst.ru/lcc);
@ -2329,14 +2329,16 @@ public:
buffer(const char *c_str, bool make_reference,
const allocator_type &allocator = allocator_type())
: buffer(::mdbx::slice(c_str), make_reference, allocator) {}
: buffer(::mdbx::slice(c_str), make_reference, allocator){}
#if defined(DOXYGEN) || \
(defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L)
template <class CHAR, class T>
buffer(const ::std::basic_string_view<CHAR, T> &view, bool make_reference,
const allocator_type &allocator = allocator_type())
: buffer(::mdbx::slice(view), make_reference, allocator) {}
template <class CHAR, class T>
buffer(const ::std::basic_string_view<CHAR, T> &view,
bool make_reference,
const allocator_type &allocator = allocator_type())
: buffer(::mdbx::slice(view), make_reference, allocator) {
}
#endif /* __cpp_lib_string_view >= 201606L */
MDBX_CXX20_CONSTEXPR
@ -2362,15 +2364,16 @@ public:
MDBX_CXX20_CONSTEXPR
buffer(const char *c_str, const allocator_type &allocator = allocator_type())
: buffer(::mdbx::slice(c_str), allocator) {}
: buffer(::mdbx::slice(c_str), allocator){}
#if defined(DOXYGEN) || \
(defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L)
template <class CHAR, class T>
MDBX_CXX20_CONSTEXPR
buffer(const ::std::basic_string_view<CHAR, T> &view,
const allocator_type &allocator = allocator_type())
: buffer(::mdbx::slice(view), allocator) {}
template <class CHAR, class T>
MDBX_CXX20_CONSTEXPR
buffer(const ::std::basic_string_view<CHAR, T> &view,
const allocator_type &allocator = allocator_type())
: buffer(::mdbx::slice(view), allocator) {
}
#endif /* __cpp_lib_string_view >= 201606L */
buffer(size_t head_room, size_t tail_room,
@ -3819,17 +3822,17 @@ public:
static inline size_t pairsize4page_max(const env &, value_mode);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for specified size and database flags.
/// single large/overflow-page for specified size and database flags.
static inline size_t valsize4page_max(intptr_t pagesize,
MDBX_db_flags_t flags);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for specified page size and values mode.
/// single large/overflow-page for specified page size and values mode.
static inline size_t valsize4page_max(intptr_t pagesize, value_mode);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for given environment and database flags.
/// single large/overflow-page for given environment and database flags.
static inline size_t valsize4page_max(const env &, MDBX_db_flags_t flags);
/// \brief Returns maximal data size in bytes to fit in a leaf-page or
/// single overflow/large-page for specified page size and values mode.
/// single large/overflow-page for specified page size and values mode.
static inline size_t valsize4page_max(const env &, value_mode);
/// \brief Returns the maximal write transaction size (i.e. limit for

View File

@ -1,184 +0,0 @@
## Legacy standalone build script for libmdbx: builds static + shared
## libraries, tools, tests and RPM packages (via CPack).
cmake_minimum_required(VERSION 2.8.7)
set(TARGET mdbx)
project(${TARGET})

## Library version, composed into MAJOR.MINOR.RELEASE for SO-versioning.
set(MDBX_VERSION_MAJOR 0)
set(MDBX_VERSION_MINOR 3)
set(MDBX_VERSION_RELEASE 1)
set(MDBX_VERSION_REVISION 0)
set(MDBX_VERSION_STRING ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE})

enable_language(C)
enable_language(CXX)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED on)
## Release-style defines for all configurations; MDBX_DEBUG=0 disables
## internal audit/assert machinery.
add_definitions(-DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 -D_GNU_SOURCE=1)
find_package(Threads REQUIRED)

## Detect whether we are a subproject: compiler flags, testing and packaging
## are only configured for a standalone (top-level) build.
get_directory_property(hasParent PARENT_DIRECTORY)
if(hasParent)
  set(STANDALONE_BUILD 0)
else()
  set(STANDALONE_BUILD 1)
  enable_testing()

  ## C flags (GCC only): warnings-as-errors, hidden visibility, PIC.
  if (CMAKE_C_COMPILER_ID MATCHES GNU)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g3")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wextra")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
  endif()

  ## C++ flags (GCC only), used by the test suite and tools.
  if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wpointer-arith")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat-security")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Woverloaded-virtual")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wwrite-strings")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmax-errors=20")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wunused-function -Wunused-variable -Wunused-value -Wmissing-declarations")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wcast-qual")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finline-functions-called-once")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-packed-bitfield-compat")
    set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3")
    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g3")
  endif()

  ## Optional gcov instrumentation; only meaningful for Debug builds.
  if (COVERAGE)
    if (NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
      message(FATAL_ERROR "Coverage requires -DCMAKE_BUILD_TYPE=Debug Current value=${CMAKE_BUILD_TYPE}")
    endif()
    message(STATUS "Setting coverage compiler flags")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ggdb3 -O0 --coverage -fprofile-arcs -ftest-coverage")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -ggdb3 -O0 --coverage -fprofile-arcs -ftest-coverage")
    add_definitions(-DCOVERAGE_TEST)
  endif()

  ## ASan/LSan for local Debug builds; skipped on Travis CI workers.
  if (NOT TRAVIS)
    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=leak -fstack-protector-strong -static-libasan")
  endif()
endif()

## Amalgamated source list shared by the static and shared library targets.
set(${TARGET}_SRC
  mdbx.h
  src/bits.h
  src/defs.h
  src/lck-linux.c
  src/mdbx.c
  src/osal.c
  src/osal.h
  src/version.c
  )

add_library(${TARGET}_STATIC STATIC
  ${${TARGET}_SRC}
  )
add_library(${TARGET} ALIAS ${TARGET}_STATIC)

add_library(${TARGET}_SHARED SHARED
  ${${TARGET}_SRC}
  )

## Both targets produce the same output name "mdbx" (libmdbx.a / libmdbx.so).
set_target_properties(${TARGET}_SHARED PROPERTIES
  VERSION ${MDBX_VERSION_STRING}
  SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}
  OUTPUT_NAME ${TARGET}
  CLEAN_DIRECT_OUTPUT 1
  )
set_target_properties(${TARGET}_STATIC PROPERTIES
  VERSION ${MDBX_VERSION_STRING}
  SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}
  OUTPUT_NAME ${TARGET}
  CLEAN_DIRECT_OUTPUT 1
  )

target_include_directories(${TARGET}_STATIC PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(${TARGET}_SHARED PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${TARGET}_STATIC ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET}_SHARED ${CMAKE_THREAD_LIBS_INIT})
## librt for clock_gettime/shm on older glibc; not needed on macOS.
if(UNIX AND NOT APPLE)
  target_link_libraries(${TARGET}_STATIC rt)
  target_link_libraries(${TARGET}_SHARED rt)
endif()

install(TARGETS ${TARGET}_STATIC DESTINATION ${CMAKE_INSTALL_PREFIX}/lib64 COMPONENT mdbx)
install(TARGETS ${TARGET}_SHARED DESTINATION ${CMAKE_INSTALL_PREFIX}/lib64 COMPONENT mdbx)
install(FILES mdbx.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include COMPONENT mdbx-devel)

add_subdirectory(src/tools)
add_subdirectory(test)
add_subdirectory(test/pcrf)
add_subdirectory(tutorial)

##############################################################################
## RPM packaging (mdbx + mdbx-devel) via CPack.
set(CPACK_GENERATOR "RPM")
set(CPACK_RPM_COMPONENT_INSTALL ON)
# Version
## Release number comes from CI (Jenkins BUILD_NUMBER, then GitLab
## CI_PIPELINE_ID), falling back to 1 for local builds.
if (NOT "$ENV{BUILD_NUMBER}" STREQUAL "")
  set(CPACK_PACKAGE_RELEASE $ENV{BUILD_NUMBER})
else()
  if (NOT "$ENV{CI_PIPELINE_ID}" STREQUAL "")
    set(CPACK_PACKAGE_RELEASE $ENV{CI_PIPELINE_ID})
  else()
    set(CPACK_PACKAGE_RELEASE 1)
  endif()
endif()
set(CPACK_RPM_PACKAGE_RELEASE ${CPACK_PACKAGE_RELEASE})
set(CPACK_PACKAGE_VERSION ${MDBX_VERSION_STRING})
set(CPACK_PACKAGE_VERSION_FULL ${CPACK_PACKAGE_VERSION}-${CPACK_PACKAGE_RELEASE})
set(CPACK_RPM_mdbx-devel_PACKAGE_REQUIRES "mdbx = ${CPACK_PACKAGE_VERSION}")
set(CPACK_RPM_SPEC_INSTALL_POST "/bin/true")
set(CPACK_RPM_mdbx_PACKAGE_NAME mdbx)
set(CPACK_RPM_mdbx-devel_PACKAGE_NAME mdbx-devel)
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "The revised and extended descendant of Symas LMDB")
set(CPACK_PACKAGE_VENDOR "???")
set(CPACK_PACKAGE_CONTACT "Vladimir Romanov")
set(CPACK_PACKAGE_RELOCATABLE false)
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_REQUIRES "")
set(CPACK_RPM_PACKAGE_GROUP "Applications/Database")
set(CPACK_RPM_mdbx_FILE_NAME "${CPACK_RPM_mdbx_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_FULL}.${CPACK_RPM_PACKAGE_ARCHITECTURE}.rpm")
set(CPACK_RPM_mdbx-devel_FILE_NAME "${CPACK_RPM_mdbx-devel_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_FULL}.${CPACK_RPM_PACKAGE_ARCHITECTURE}.rpm")
## Keep standard prefix directories out of the RPM file list so installing
## does not conflict with filesystem-owned paths.
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
  /usr/local
  /usr/local/bin
  /usr/local/lib64
  /usr/local/include
  /usr/local/man
  /usr/local/man/man1
  )
include(CPack)

View File

@ -1,18 +0,0 @@
#!/bin/bash
# Out-of-source build helper: configures (once) and builds the project in
# cmake-build-<CONFIG>. The build type is the first argument (default Debug).
set -e

# Default the configuration when no argument was given.
CONFIG="${1:-Debug}"

# Prefer the devtoolset-6 toolchain when available (CentOS/RHEL).
if [[ -r /opt/rh/devtoolset-6/enable ]]; then
  source /opt/rh/devtoolset-6/enable
fi

#rm -f -r build || true
mkdir -p "cmake-build-${CONFIG}"
pushd "cmake-build-${CONFIG}" &> /dev/null
# Configure only on the first run (no Makefile yet).
if [[ ! -r Makefile ]]; then
  cmake .. -DCMAKE_BUILD_TYPE="${CONFIG}"
fi
make -j8 || exit 1
popd &> /dev/null

View File

@ -1,25 +0,0 @@
#!/bin/bash
# RPM packaging helper: configures (once) and runs `make package` in
# cmake-build-<CONFIG>. The build type is the first argument (default Debug).
set -e

# Default the configuration when no argument was given.
CONFIG="${1:-Debug}"

# NOTE(review): the original also computed DIRNAME from BASH_SOURCE but never
# used it — dead code removed.

# Prefer the devtoolset-6 toolchain when available (CentOS/RHEL).
if [[ -r /opt/rh/devtoolset-6/enable ]]; then
  source /opt/rh/devtoolset-6/enable
fi

mkdir -p "cmake-build-${CONFIG}"
pushd "cmake-build-${CONFIG}" &> /dev/null
# Configure only on the first run (no Makefile yet).
if [[ ! -r Makefile ]]; then
  cmake .. -DCMAKE_BUILD_TYPE="${CONFIG}"
fi
# Drop stale packages, build fresh ones, and discard the catch-all
# "-Unspecified" component package produced by CPack.
rm -f *.rpm
make -j8 package || exit 1
rm -f *-Unspecified.rpm
popd &> /dev/null

View File

@ -1,25 +1,52 @@
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#define xMDBX_ALLOY 1 /* alloyed build */
#include "internals.h" /* must be included first */
#include "core.c"
#include "osal.c"
#include "version.c"
#if defined(_WIN32) || defined(_WIN64)
#include "lck-windows.c"
#else
#include "api-cursor.c"
#include "api-env.c"
#include "api-extra.c"
#include "api-key-transform.c"
#include "api-txn.c"
#include "audit.c"
#include "chk.c"
#include "cogs.c"
#include "coherency.c"
#include "cold.c"
#include "copy.c"
#include "cursor.c"
#include "dbi.c"
#include "dpl.c"
#include "dxb.c"
#include "env-opts.c"
#include "env.c"
#include "gc-get.c"
#include "gc-put.c"
#include "global.c"
#include "lck-posix.c"
#endif
#include "lck-windows.c"
#include "lck.c"
#include "logging_and_debug.c"
#include "meta.c"
#include "misc.c"
#include "mvcc-readers.c"
#include "node.c"
#include "osal.c"
#include "page-get.c"
#include "page-iov.c"
#include "page-ops.c"
#include "page-search.c"
#include "pnl.c"
#include "range-estimate.c"
#include "refund.c"
#include "spill.c"
#include "subdb.c"
#include "tls.c"
#include "tree.c"
#include "txl.c"
#include "txn.c"
#include "utils.c"
#include "version.c"
#include "walk.c"
#include "windows-import.c"

797
src/api-cursor.c Normal file
View File

@ -0,0 +1,797 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Allocates a fresh, unbound cursor couple (outer cursor + nested dupsort
 * cursor) and returns the outer cursor, or nullptr on allocation failure.
 * The cursor is created in the "ready4dispose" state and must be bound to a
 * transaction/DBI (mdbx_cursor_bind) before use. `context` is an opaque
 * user pointer retrievable via mdbx_cursor_get_userctx(). */
MDBX_cursor *mdbx_cursor_create(void *context) {
  /* calloc: all fields start zeroed, including flags and pointers. */
  cursor_couple_t *couple = osal_calloc(1, sizeof(cursor_couple_t));
  if (unlikely(!couple))
    return nullptr;

  /* Mark the whole couple undefined for Valgrind, then selectively mark as
   * defined only the fields that may be read before full initialization. */
  VALGRIND_MAKE_MEM_UNDEFINED(couple, sizeof(cursor_couple_t));
  couple->outer.signature = cur_signature_ready4dispose;
  /* next == self means "not tracked by any transaction". */
  couple->outer.next = &couple->outer;
  couple->userctx = context;
  /* Both cursors start in the "poor" (unpositioned) state; the inner one is
   * additionally tagged as the nested dupsort cursor. */
  couple->outer.top_and_flags = z_poor_mark;
  couple->inner.cursor.top_and_flags = z_poor_mark | z_inner;
  VALGRIND_MAKE_MEM_DEFINED(&couple->outer.backup,
                            sizeof(couple->outer.backup));
  VALGRIND_MAKE_MEM_DEFINED(&couple->outer.tree, sizeof(couple->outer.tree));
  VALGRIND_MAKE_MEM_DEFINED(&couple->outer.clc, sizeof(couple->outer.clc));
  VALGRIND_MAKE_MEM_DEFINED(&couple->outer.dbi_state,
                            sizeof(couple->outer.dbi_state));
  VALGRIND_MAKE_MEM_DEFINED(&couple->outer.subcur,
                            sizeof(couple->outer.subcur));
  VALGRIND_MAKE_MEM_DEFINED(&couple->outer.txn, sizeof(couple->outer.txn));
  return &couple->outer;
}
/* Re-binds the cursor to `txn`, keeping the same table: the DBI index is
 * recovered from the cursor's key-value context slot within the
 * environment's kvs[] array. Returns MDBX_EINVAL for a null cursor. */
int mdbx_cursor_renew(const MDBX_txn *txn, MDBX_cursor *mc) {
  if (unlikely(!mc))
    return MDBX_EINVAL;
  const size_t dbi = (kvx_t *)mc->clc - txn->env->kvs;
  return mdbx_cursor_bind(txn, mc, (MDBX_dbi)dbi);
}
/* Resets a cursor to the "poor" (unpositioned) state without unbinding it
 * from its transaction; both the outer cursor and the nested dupsort cursor
 * of the couple are reset. */
int mdbx_cursor_reset(MDBX_cursor *mc) {
  if (unlikely(!mc))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live &&
               mc->signature != cur_signature_ready4dispose))
    return MDBX_EBADSIGN;

  cursor_couple_t *const couple = (cursor_couple_t *)mc;
  couple->outer.top_and_flags = z_poor_mark;
  couple->inner.cursor.top_and_flags = z_poor_mark | z_inner;
  return MDBX_SUCCESS;
}
/* Binds a cursor to the given transaction and DBI, (re)initializing it and
 * linking it into the transaction's per-DBI cursor-tracking list.
 * Returns MDBX_EINVAL/MDBX_EBADSIGN for invalid cursors, propagates
 * transaction/DBI validation errors, and refuses to change the DBI of a
 * cursor inherited from a parent (nested) transaction. */
int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
  if (unlikely(!mc))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_ready4dispose &&
               mc->signature != cur_signature_live))
    return MDBX_EBADSIGN;
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  rc = dbi_check(txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* Cursors on the GC/free DBI are permitted only within read-only txns. */
  if (unlikely(dbi == FREE_DBI && !(txn->flags & MDBX_TXN_RDONLY)))
    return MDBX_EACCESS;
  if (unlikely(mc->backup)) /* Cursor from parent transaction */ {
    cASSERT(mc, mc->signature == cur_signature_live);
    /* A cursor shadowed by a parent txn may only be re-bound to the same
     * txn and DBI; anything else is rejected. */
    if (unlikely(cursor_dbi(mc) != dbi ||
                 /* paranoia */ mc->signature != cur_signature_live ||
                 mc->txn != txn))
      return MDBX_EINVAL;
    cASSERT(mc, mc->tree == &txn->dbs[dbi]);
    cASSERT(mc, mc->clc == &txn->env->kvs[dbi].clc);
    cASSERT(mc, cursor_dbi(mc) == dbi);
    return likely(cursor_dbi(mc) == dbi &&
                  /* paranoia */ mc->signature == cur_signature_live &&
                  mc->txn == txn)
               ? MDBX_SUCCESS
               : MDBX_EINVAL /* Disallow change DBI in nested transactions */;
  }
  /* A still-live cursor must first be detached from its current txn. */
  if (mc->signature == cur_signature_live) {
    rc = mdbx_cursor_unbind(mc);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  cASSERT(mc, mc->next == mc);
  rc = cursor_init(mc, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* Push onto the head of the transaction's per-DBI tracking list. */
  mc->next = txn->cursors[dbi];
  txn->cursors[dbi] = mc;
  return MDBX_SUCCESS;
}
/* Detaches a live cursor from its transaction's tracking list and marks it
 * "ready4dispose" so it can be re-bound or closed; the cursor itself is not
 * freed. Unbinding an already-unbound cursor is a no-op (MDBX_SUCCESS).
 * Cursors shadowed by a parent (nested) transaction cannot be unbound. */
int mdbx_cursor_unbind(MDBX_cursor *mc) {
  if (unlikely(!mc))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature == cur_signature_ready4dispose) ? MDBX_SUCCESS
                                                          : MDBX_EBADSIGN;
  if (unlikely(mc->backup)) /* Cursor from parent transaction */
    return MDBX_EINVAL;

  eASSERT(nullptr, mc->txn && mc->txn->signature == txn_signature);
  cASSERT(mc, mc->signature == cur_signature_live);
  cASSERT(mc, !mc->backup);
  /* Defensive re-check in release builds: a stale/garbage txn pointer means
   * internal corruption rather than caller error. */
  if (unlikely(!mc->txn || mc->txn->signature != txn_signature)) {
    ERROR("Wrong cursor's transaction %p 0x%x",
          __Wpedantic_format_voidptr(mc->txn),
          mc->txn ? mc->txn->signature : 0);
    return MDBX_PROBLEM;
  }
  /* next == self means "not tracked"; otherwise unlink from the
   * singly-linked per-DBI cursor list of the transaction. */
  if (mc->next != mc) {
    const size_t dbi = (kvx_t *)mc->clc - mc->txn->env->kvs;
    cASSERT(mc, cursor_dbi(mc) == dbi);
    cASSERT(mc, dbi < mc->txn->n_dbi);
    if (dbi < mc->txn->n_dbi) {
      MDBX_cursor **prev = &mc->txn->cursors[dbi];
      while (*prev && *prev != mc)
        prev = &(*prev)->next;
      cASSERT(mc, *prev == mc);
      *prev = mc->next;
    }
    mc->next = mc;
  }
  mc->signature = cur_signature_ready4dispose;
  mc->flags = 0;
  return MDBX_SUCCESS;
}
/* Creates a new cursor and binds it to the given transaction/DBI.
 * On success *ret receives the cursor; on any failure *ret is nullptr and
 * the error code of the failed step is returned. */
int mdbx_cursor_open(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) {
  if (unlikely(!ret))
    return MDBX_EINVAL;
  *ret = nullptr;

  MDBX_cursor *const cursor = mdbx_cursor_create(nullptr);
  if (unlikely(!cursor))
    return MDBX_ENOMEM;

  const int err = mdbx_cursor_bind(txn, cursor, dbi);
  if (likely(err == MDBX_SUCCESS)) {
    *ret = cursor;
    return MDBX_SUCCESS;
  }

  /* Binding failed: dispose of the freshly created cursor. */
  mdbx_cursor_close(cursor);
  return err;
}
/* Destroys a cursor. A regular cursor is unlinked from its transaction (if
 * tracked) and freed immediately. A cursor shadowed by a parent (nested)
 * transaction is not freed here: it is marked "wait4eot" and reclaimed when
 * the nested transaction ends. Accepts live or ready4dispose cursors. */
void mdbx_cursor_close(MDBX_cursor *mc) {
  if (likely(mc)) {
    ENSURE(nullptr, mc->signature == cur_signature_live ||
                        mc->signature == cur_signature_ready4dispose);
    MDBX_txn *const txn = mc->txn;
    if (!mc->backup) {
      mc->txn = nullptr;
      /* Unlink from txn, if tracked. */
      if (mc->next != mc) {
        ENSURE(txn->env, check_txn(txn, 0) == MDBX_SUCCESS);
        const size_t dbi = (kvx_t *)mc->clc - txn->env->kvs;
        tASSERT(txn, dbi < txn->n_dbi);
        if (dbi < txn->n_dbi) {
          /* Walk the singly-linked per-DBI list to find and remove mc. */
          MDBX_cursor **prev = &txn->cursors[dbi];
          while (*prev && *prev != mc)
            prev = &(*prev)->next;
          tASSERT(txn, *prev == mc);
          *prev = mc->next;
        }
        mc->next = mc;
      }
      /* Clear the signature before freeing to catch use-after-free. */
      mc->signature = 0;
      osal_free(mc);
    } else {
      /* Cursor closed before nested txn ends */
      tASSERT(txn, mc->signature == cur_signature_live);
      ENSURE(txn->env, check_txn_rw(txn, 0) == MDBX_SUCCESS);
      mc->signature = cur_signature_wait4eot;
    }
  }
}
/* Copies the position and state of cursor `src` into `dest`, first binding
 * `dest` to the same transaction and DBI. For dupsort tables the nested
 * (sub)cursor's state is copied too via a second pass of the `again` loop,
 * after reseating both pointers onto the inner cursors. */
int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) {
  if (unlikely(!src))
    return MDBX_EINVAL;
  if (unlikely(src->signature != cur_signature_live))
    return (src->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
                                                           : MDBX_EBADSIGN;
  int rc = mdbx_cursor_bind(src->txn, dest, cursor_dbi(src));
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  assert(dest->tree == src->tree);
  assert(cursor_dbi(dest) == cursor_dbi(src));
again:
  assert(dest->clc == src->clc);
  assert(dest->txn == src->txn);
  /* Copy the cursor stack: page pointers and key indices up to `top`. */
  dest->top_and_flags = src->top_and_flags;
  for (intptr_t i = 0; i <= src->top; ++i) {
    dest->ki[i] = src->ki[i];
    dest->pg[i] = src->pg[i];
  }

  /* Second pass for the nested dupsort cursor, if any. */
  if (src->subcur) {
    dest->subcur->nested_tree = src->subcur->nested_tree;
    src = &src->subcur->cursor;
    dest = &dest->subcur->cursor;
    goto again;
  }
  return MDBX_SUCCESS;
}
/* Detaches (unbind=true) or destroys (unbind=false) every cursor tracked by
 * the transaction, for all DBIs starting from MAIN_DBI. On success returns
 * the number of processed cursors (saturated at INT_MAX); on failure returns
 * the (negative) transaction-validation error. */
int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind) {
  int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD);
  if (likely(rc == MDBX_SUCCESS)) {
    TXN_FOREACH_DBI_FROM(txn, i, MAIN_DBI) {
      /* Pop cursors from the head of the per-DBI list one by one. */
      while (txn->cursors[i]) {
        MDBX_cursor *mc = txn->cursors[i];
        ENSURE(nullptr, mc->signature == cur_signature_live &&
                            (mc->next != mc) && !mc->backup);
        /* rc doubles as the processed-cursor counter here. */
        rc = likely(rc < INT_MAX) ? rc + 1 : rc;
        txn->cursors[i] = mc->next;
        mc->next = mc;
        if (unbind) {
          mc->signature = cur_signature_ready4dispose;
          mc->flags = 0;
        } else {
          mc->signature = 0;
          osal_free(mc);
        }
      }
    }
  } else {
    eASSERT(nullptr, rc < 0);
  }
  return rc;
}
/* Orders two cursors. Cursors over the same table compare by their position
 * (key index stack, then depth, then duplicate position). Cursors that are
 * not meaningfully comparable (different env/txn/table, invalid, or
 * unpositioned) yield a result scaled by `incomparable`, with larger
 * multipliers for "more fundamental" differences so callers can tell the
 * cases apart. With ignore_multival=true the duplicate (inner cursor)
 * position is not compared. */
int mdbx_cursor_compare(const MDBX_cursor *l, const MDBX_cursor *r,
                        bool ignore_multival) {
  const int incomparable = INT16_MAX + 1;
  if (unlikely(!l))
    return r ? -incomparable * 9 : 0;
  else if (unlikely(!r))
    return incomparable * 9;

  if (unlikely(l->signature != cur_signature_live))
    return (r->signature == cur_signature_live) ? -incomparable * 8 : 0;
  if (unlikely(r->signature != cur_signature_live))
    return (l->signature == cur_signature_live) ? incomparable * 8 : 0;

  /* Different key-value contexts: rank by env, then txnid, then table. */
  if (unlikely(l->clc != r->clc)) {
    if (l->txn->env != r->txn->env)
      return (l->txn->env > r->txn->env) ? incomparable * 7 : -incomparable * 7;
    if (l->txn->txnid != r->txn->txnid)
      return (l->txn->txnid > r->txn->txnid) ? incomparable * 6
                                             : -incomparable * 6;
    return (l->clc > r->clc) ? incomparable * 5 : -incomparable * 5;
  }
  assert(cursor_dbi(l) == cursor_dbi(r));

  /* Positioned vs unpositioned, then element-wise key-index comparison. */
  int diff = is_pointed(l) - is_pointed(r);
  if (unlikely(diff))
    return (diff > 0) ? incomparable * 4 : -incomparable * 4;
  if (unlikely(!is_pointed(l)))
    return 0;

  intptr_t detent = (l->top <= r->top) ? l->top : r->top;
  for (intptr_t i = 0; i <= detent; ++i) {
    diff = l->ki[i] - r->ki[i];
    if (diff)
      return diff;
  }
  if (unlikely(l->top != r->top))
    return (l->top > r->top) ? incomparable * 3 : -incomparable * 3;

  assert((l->subcur != nullptr) == (r->subcur != nullptr));
  if (unlikely((l->subcur != nullptr) != (r->subcur != nullptr)))
    return l->subcur ? incomparable * 2 : -incomparable * 2;
  if (ignore_multival || !l->subcur)
    return 0;

#if MDBX_DEBUG
  /* Debug-only sanity: a positioned inner cursor implies a dupsort node. */
  if (is_pointed(&l->subcur->cursor)) {
    const page_t *mp = l->pg[l->top];
    const node_t *node = page_node(mp, l->ki[l->top]);
    assert(node_flags(node) & N_DUPDATA);
  }
  if (is_pointed(&r->subcur->cursor)) {
    const page_t *mp = r->pg[r->top];
    const node_t *node = page_node(mp, r->ki[r->top]);
    assert(node_flags(node) & N_DUPDATA);
  }
#endif /* MDBX_DEBUG */

  /* Same key: compare the duplicate position via the inner cursors,
   * mirroring the outer comparison above. */
  l = &l->subcur->cursor;
  r = &r->subcur->cursor;
  diff = is_pointed(l) - is_pointed(r);
  if (unlikely(diff))
    return (diff > 0) ? incomparable * 2 : -incomparable * 2;
  if (unlikely(!is_pointed(l)))
    return 0;

  detent = (l->top <= r->top) ? l->top : r->top;
  for (intptr_t i = 0; i <= detent; ++i) {
    diff = l->ki[i] - r->ki[i];
    if (diff)
      return diff;
  }
  if (unlikely(l->top != r->top))
    return (l->top > r->top) ? incomparable : -incomparable;

  /* Finally tie-break on the end-of-duplicates flag. */
  return (l->flags & z_eof_hard) - (r->flags & z_eof_hard);
}
/* Return the count of duplicate data items for the current key */
/* Stores 0 into *countp when the cursor is not positioned on a key, 1 for a
 * plain (non-dupsort) entry, or the number of duplicates (clamped to
 * PTRDIFF_MAX) for a dupsort entry. */
int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
                                                          : MDBX_EBADSIGN;
  int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(countp == nullptr))
    return MDBX_EINVAL;

  /* is_filled() yields 0 or 1; this also seeds *countp for the plain case. */
  if ((*countp = is_filled(mc)) > 0) {
    if (!inner_hollow(mc)) {
      const page_t *mp = mc->pg[mc->top];
      const node_t *node = page_node(mp, mc->ki[mc->top]);
      cASSERT(mc, node_flags(node) & N_DUPDATA);
      /* Clamp the 64-bit item counter into a size_t-safe range. */
      *countp = unlikely(mc->subcur->nested_tree.items > PTRDIFF_MAX)
                    ? PTRDIFF_MAX
                    : (size_t)mc->subcur->nested_tree.items;
    }
  }
  return MDBX_SUCCESS;
}
/* Returns MDBX_RESULT_TRUE when the cursor stands on the first key of the
 * table, i.e. every level of the cursor stack is at index 0. */
int mdbx_cursor_on_first(const MDBX_cursor *mc) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;

  intptr_t level = 0;
  while (level <= mc->top && mc->ki[level] == 0)
    ++level;
  return (level > mc->top) ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE;
}
/* Returns MDBX_RESULT_TRUE when the cursor stands on the first duplicate of
 * the current key; vacuously true for non-dupsort tables or when the cursor
 * is not positioned on an entry. */
int mdbx_cursor_on_first_dup(const MDBX_cursor *mc) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;

  if (is_filled(mc) && mc->subcur) {
    /* Inspect the nested dupsort cursor's stack instead of the outer one. */
    const MDBX_cursor *inner = &mc->subcur->cursor;
    for (intptr_t level = 0; level <= inner->top; ++level)
      if (inner->ki[level] != 0)
        return MDBX_RESULT_FALSE;
  }
  return MDBX_RESULT_TRUE;
}
/* Returns MDBX_RESULT_TRUE when the cursor stands on the last key of the
 * table, i.e. every level of the cursor stack is at its page's last slot. */
int mdbx_cursor_on_last(const MDBX_cursor *mc) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;

  for (intptr_t level = mc->top; level >= 0; --level) {
    const size_t nkeys = page_numkeys(mc->pg[level]);
    if (mc->ki[level] < nkeys - 1)
      return MDBX_RESULT_FALSE;
  }
  return MDBX_RESULT_TRUE;
}
/* Returns MDBX_RESULT_TRUE when the cursor stands on the last duplicate of
 * the current key; vacuously true for non-dupsort tables or when the cursor
 * is not positioned on an entry. */
int mdbx_cursor_on_last_dup(const MDBX_cursor *mc) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;

  if (is_filled(mc) && mc->subcur) {
    /* Inspect the nested dupsort cursor's stack instead of the outer one. */
    const MDBX_cursor *inner = &mc->subcur->cursor;
    for (intptr_t level = 0; level <= inner->top; ++level) {
      const size_t nkeys = page_numkeys(inner->pg[level]);
      if (inner->ki[level] < nkeys - 1)
        return MDBX_RESULT_FALSE;
    }
  }
  return MDBX_RESULT_TRUE;
}
/* Reports whether the cursor is past the end of data (end-of-table). */
int mdbx_cursor_eof(const MDBX_cursor *mc) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;
  return is_eof(mc) ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE;
}
/* Public entry point for all cursor read operations: validates the cursor,
 * its transaction and DBI binding, then dispatches `op` to cursor_ops(). */
int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                    MDBX_cursor_op op) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;

  const int err = check_txn(mc->txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(cursor_dbi_changed(mc)))
    return MDBX_BAD_DBI;

  return cursor_ops(mc, key, data, op);
}
/* Scan driver shared by mdbx_cursor_scan() and mdbx_cursor_scan_from():
 * repeatedly applies `predicate` to the current key/value and advances the
 * cursor with `turn_op` until the predicate returns something other than
 * MDBX_RESULT_FALSE (a match or an error) or the data is exhausted
 * (MDBX_NOTFOUND is mapped to MDBX_RESULT_FALSE = "no match").
 * NOTE(review): the name looks like a typo of "scan_continue"; kept as-is
 * since both public scan functions reference it. */
__hot static int scan_confinue(MDBX_cursor *mc, MDBX_predicate_func *predicate,
                               void *context, void *arg, MDBX_val *key,
                               MDBX_val *value, MDBX_cursor_op turn_op) {
  int rc;
  switch (turn_op) {
  /* Forward over keys (and duplicates for plain NEXT). */
  case MDBX_NEXT:
  case MDBX_NEXT_NODUP:
    for (;;) {
      rc = predicate(context, key, value, arg);
      if (rc != MDBX_RESULT_FALSE)
        return rc;
      rc = outer_next(mc, key, value, turn_op);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
    }

  /* Backward over keys. */
  case MDBX_PREV:
  case MDBX_PREV_NODUP:
    for (;;) {
      rc = predicate(context, key, value, arg);
      if (rc != MDBX_RESULT_FALSE)
        return rc;
      rc = outer_prev(mc, key, value, turn_op);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
    }

  /* Forward over duplicates of the current key only; a table without a
   * nested cursor has no duplicates to scan. */
  case MDBX_NEXT_DUP:
    if (mc->subcur)
      for (;;) {
        rc = predicate(context, key, value, arg);
        if (rc != MDBX_RESULT_FALSE)
          return rc;
        rc = inner_next(&mc->subcur->cursor, value);
        if (unlikely(rc != MDBX_SUCCESS))
          return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
      }
    return MDBX_NOTFOUND;

  /* Backward over duplicates of the current key only. */
  case MDBX_PREV_DUP:
    if (mc->subcur)
      for (;;) {
        rc = predicate(context, key, value, arg);
        if (rc != MDBX_RESULT_FALSE)
          return rc;
        rc = inner_prev(&mc->subcur->cursor, value);
        if (unlikely(rc != MDBX_SUCCESS))
          return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
      }
    return MDBX_NOTFOUND;

  /* Any other (already validated) op: advance via the generic dispatcher. */
  default:
    for (;;) {
      rc = predicate(context, key, value, arg);
      if (rc != MDBX_RESULT_FALSE)
        return rc;
      rc = cursor_ops(mc, key, value, turn_op);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
    }
  }
}
/* Scans the table from a canonical starting position (`start_op`) applying
 * `predicate` to each entry and stepping with `turn_op`; returns the
 * predicate's verdict, MDBX_RESULT_FALSE when nothing matched, or an error.
 * Only whitelisted start/turn operations are accepted (bitmask checks). */
int mdbx_cursor_scan(MDBX_cursor *mc, MDBX_predicate_func *predicate,
                     void *context, MDBX_cursor_op start_op,
                     MDBX_cursor_op turn_op, void *arg) {
  if (unlikely(!predicate))
    return MDBX_EINVAL;

  /* start_op must be one of the absolute-positioning ops; the `> 30` guard
   * keeps the shift below well-defined for out-of-range values. */
  const unsigned valid_start_mask =
      1 << MDBX_FIRST | 1 << MDBX_FIRST_DUP | 1 << MDBX_LAST |
      1 << MDBX_LAST_DUP | 1 << MDBX_GET_CURRENT | 1 << MDBX_GET_MULTIPLE;
  if (unlikely(start_op > 30 || ((1 << start_op) & valid_start_mask) == 0))
    return MDBX_EINVAL;

  /* turn_op must be one of the relative-stepping ops. */
  const unsigned valid_turn_mask =
      1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP |
      1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP |
      1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE;
  if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0))
    return MDBX_EINVAL;

  MDBX_val key = {nullptr, 0}, value = {nullptr, 0};
  int rc = mdbx_cursor_get(mc, &key, &value, start_op);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  return scan_confinue(mc, predicate, context, arg, &key, &value, turn_op);
}
/* Like mdbx_cursor_scan(), but starts from a caller-supplied key (and
 * optionally value) positioned via `from_op`. The starting lookup tolerates
 * non-error results (e.g. MDBX_RESULT_TRUE from SET_LOWERBOUND); when no
 * `value` buffer is given, the current value is fetched before scanning. */
int mdbx_cursor_scan_from(MDBX_cursor *mc, MDBX_predicate_func *predicate,
                          void *context, MDBX_cursor_op from_op, MDBX_val *key,
                          MDBX_val *value, MDBX_cursor_op turn_op, void *arg) {
  if (unlikely(!predicate || !key))
    return MDBX_EINVAL;

  /* from_op is either a whitelisted keyed-lookup op, or one of the
   * MDBX_TO_* ops (which start at MDBX_TO_KEY_LESSER_THAN and above). */
  const unsigned valid_start_mask =
      1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY |
      1 << MDBX_GET_MULTIPLE | 1 << MDBX_SET_LOWERBOUND |
      1 << MDBX_SET_UPPERBOUND;
  if (unlikely(from_op < MDBX_TO_KEY_LESSER_THAN &&
               ((1 << from_op) & valid_start_mask) == 0))
    return MDBX_EINVAL;

  /* turn_op must be one of the relative-stepping ops; the `> 30` guard
   * keeps the shift below well-defined for out-of-range values. */
  const unsigned valid_turn_mask =
      1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP |
      1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP |
      1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE;
  if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0))
    return MDBX_EINVAL;

  /* Position the cursor; only hard errors abort (inexact matches are ok). */
  int rc = mdbx_cursor_get(mc, key, value, from_op);
  if (unlikely(MDBX_IS_ERROR(rc)))
    return rc;

  cASSERT(mc, key != nullptr);
  MDBX_val stub;
  if (!value) {
    /* The predicate needs a value: read the current one into a stub. */
    value = &stub;
    rc = cursor_ops(mc, key, value, MDBX_GET_CURRENT);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  return scan_confinue(mc, predicate, context, arg, key, value, turn_op);
}
/* Fills `pairs` with up to limit/2 key/value pairs (stored as alternating
 * MDBX_val entries) read sequentially from the cursor's current position,
 * walking leaf pages rightward. Only plain (non-dupsort) tables are
 * supported. *count receives the number of filled MDBX_val slots. Returns
 * MDBX_RESULT_TRUE when the end of the table was reached. */
int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs,
                          size_t limit, MDBX_cursor_op op) {
  if (unlikely(!count))
    return MDBX_EINVAL;

  *count = 0;
  /* limit is in MDBX_val slots; at least two pairs' worth are required. */
  if (unlikely(mc == nullptr || limit < 4 || limit > INTPTR_MAX - 2))
    return MDBX_EINVAL;

  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
                                                          : MDBX_EBADSIGN;

  int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(cursor_dbi_changed(mc)))
    return MDBX_BAD_DBI;

  if (unlikely(mc->subcur))
    return MDBX_INCOMPATIBLE /* must be a non-dupsort subDB */;

  switch (op) {
  case MDBX_NEXT:
    /* Continue from the current position; fail if already past the end. */
    if (unlikely(is_eof(mc)))
      return is_pointed(mc) ? MDBX_NOTFOUND : MDBX_ENODATA;
    break;

  case MDBX_FIRST:
    /* Rewind only when not already positioned on an entry. */
    if (!is_filled(mc)) {
      rc = outer_first(mc, nullptr, nullptr);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    }
    break;

  default:
    DEBUG("unhandled/unimplemented cursor operation %u", op);
    return MDBX_EINVAL;
  }

  const page_t *mp = mc->pg[mc->top];
  size_t nkeys = page_numkeys(mp);
  size_t ki = mc->ki[mc->top];
  size_t n = 0;
  while (n + 2 <= limit) {
    cASSERT(mc, ki < nkeys);
    if (unlikely(ki >= nkeys))
      goto sibling;

    /* Emit one key/value pair into consecutive slots. */
    const node_t *leaf = page_node(mp, ki);
    pairs[n] = get_key(leaf);
    rc = node_read(mc, leaf, &pairs[n + 1], mp);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
    n += 2;

    if (++ki == nkeys) {
    sibling:
      /* Current leaf exhausted: move to the right sibling page. */
      rc = cursor_sibling_right(mc);
      if (rc != MDBX_SUCCESS) {
        /* End of table is reported as MDBX_RESULT_TRUE, not an error. */
        if (rc == MDBX_NOTFOUND)
          rc = MDBX_RESULT_TRUE;
        goto bailout;
      }

      mp = mc->pg[mc->top];
      DEBUG("next page is %" PRIaPGNO ", key index %u", mp->pgno,
            mc->ki[mc->top]);
      if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) {
        ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
              mp->pgno, mp->flags);
        rc = MDBX_CORRUPTED;
        goto bailout;
      }
      nkeys = page_numkeys(mp);
      ki = 0;
    }
  }
  /* Record where the batch stopped so a subsequent MDBX_NEXT resumes. */
  mc->ki[mc->top] = (indx_t)ki;

bailout:
  *count = n;
  return rc;
}
/*----------------------------------------------------------------------------*/
/* Stores an opaque user pointer in the cursor couple; accepts both live and
 * ready-for-dispose cursors. */
int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) {
  if (unlikely(!mc))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live &&
               mc->signature != cur_signature_ready4dispose))
    return MDBX_EBADSIGN;

  container_of(mc, cursor_couple_t, outer)->userctx = ctx;
  return MDBX_SUCCESS;
}
/* Fetches the opaque user pointer stored in the cursor couple, or nullptr
 * for a null/invalid cursor. */
void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) {
  if (likely(mc && (mc->signature == cur_signature_live ||
                    mc->signature == cur_signature_ready4dispose)))
    return container_of(mc, cursor_couple_t, outer)->userctx;
  return nullptr;
}
MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) {
if (unlikely(!mc || mc->signature != cur_signature_live))
return nullptr;
MDBX_txn *txn = mc->txn;
if (unlikely(!txn || txn->signature != txn_signature))
return nullptr;
if (unlikely(txn->flags & MDBX_TXN_FINISHED))
return nullptr;
return txn;
}
/* Returns the DBI the cursor is bound to, or UINT_MAX for an invalid one. */
MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) {
  if (likely(mc && mc->signature == cur_signature_live))
    return cursor_dbi(mc);
  return UINT_MAX;
}
/*----------------------------------------------------------------------------*/
/* Public entry point for cursor writes: validates the cursor, transaction
 * and flags, pre-checks MDBX_MULTIPLE batch parameters and MDBX_RESERVE
 * compatibility, then delegates to cursor_put_checklen(). */
int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
                    MDBX_put_flags_t flags) {
  if (unlikely(mc == nullptr || key == nullptr || data == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
                                                          : MDBX_EBADSIGN;

  int rc = check_txn_rw(mc->txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(cursor_dbi_changed(mc)))
    return MDBX_BAD_DBI;

  cASSERT(mc, cursor_is_tracked(mc));

  /* Check this first so counter will always be zero on any early failures. */
  if (unlikely(flags & MDBX_MULTIPLE)) {
    /* MULTIPLE batch put: data[0] describes one fixed-size element,
     * data[1].iov_len carries the element count. Requires DUPFIXED. */
    if (unlikely(flags & MDBX_RESERVE))
      return MDBX_EINVAL;
    if (unlikely(!(mc->tree->flags & MDBX_DUPFIXED)))
      return MDBX_INCOMPATIBLE;
    const size_t dcount = data[1].iov_len;
    if (unlikely(dcount < 2 || data->iov_len == 0))
      return MDBX_BAD_VALSIZE;
    /* Element size must match the table's fixed duplicate size (when the
     * table already has one recorded). */
    if (unlikely(mc->tree->dupfix_size != data->iov_len) &&
        mc->tree->dupfix_size)
      return MDBX_BAD_VALSIZE;
    if (unlikely(dcount >
                 MAX_MAPSIZE / 2 /
                     (BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) - NODESIZE))) {
      /* checking for multiplication overflow */
      if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len))
        return MDBX_TOO_LARGE;
    }
  }

  if (flags & MDBX_RESERVE) {
    /* Reservation returns a pointer to space to be filled by the caller,
     * which is incompatible with any of the dupsort table kinds. */
    if (unlikely(mc->tree->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP |
                                    MDBX_INTEGERDUP | MDBX_DUPFIXED)))
      return MDBX_INCOMPATIBLE;
    data->iov_base = nullptr;
  }

  if (unlikely(mc->txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
    return (mc->txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;

  return cursor_put_checklen(mc, key, data, flags);
}
/* Public entry point for deleting the entry at the cursor position:
 * validates the cursor, its writable transaction and DBI binding, then
 * delegates to cursor_del(). */
int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;

  const int err = check_txn_rw(mc->txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(cursor_dbi_changed(mc)))
    return MDBX_BAD_DBI;

  return cursor_del(mc, flags);
}
/* Sets the "ignore disorder" checking flag on the cursor (and its nested
 * dupsort cursor, when present). */
__cold int mdbx_cursor_ignord(MDBX_cursor *mc) {
  if (unlikely(mc == nullptr))
    return MDBX_EINVAL;
  if (unlikely(mc->signature != cur_signature_live))
    return (mc->signature != cur_signature_ready4dispose) ? MDBX_EBADSIGN
                                                          : MDBX_EINVAL;

  mc->checking |= z_ignord;
  if (mc->subcur != nullptr)
    mc->subcur->cursor.checking |= z_ignord;
  return MDBX_SUCCESS;
}

1399
src/api-env.c Normal file

File diff suppressed because it is too large Load Diff

117
src/api-extra.c Normal file
View File

@ -0,0 +1,117 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/*------------------------------------------------------------------------------
* Readers API */
/* Enumerates the reader-lock table, invoking `func` once per active reader
 * slot with its pid/tid, snapshot txnid, lag behind the head transaction,
 * and byte estimates of used/retained space. Returns MDBX_RESULT_TRUE when
 * there is no LCK (or no readers enumerated), the callback's non-success
 * result if it aborts the walk, or an error. Reader slots are sampled
 * lock-free with a read-verify-retry loop. */
__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func,
                            void *ctx) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!func))
    return MDBX_EINVAL;

  rc = MDBX_RESULT_TRUE;
  int serial = 0;
  lck_t *const lck = env->lck_mmap.lck;
  if (likely(lck)) {
    const size_t snap_nreaders =
        atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    for (size_t i = 0; i < snap_nreaders; i++) {
      const reader_slot_t *r = lck->rdt + i;
    retry_reader:;
      /* Snapshot all slot fields, then re-read and retry until they are
       * mutually consistent (another process may update them anytime). */
      const uint32_t pid = atomic_load32(&r->pid, mo_AcquireRelease);
      if (!pid)
        continue;
      txnid_t txnid = safe64_read(&r->txnid);
      const uint64_t tid = atomic_load64(&r->tid, mo_Relaxed);
      const pgno_t pages_used =
          atomic_load32(&r->snapshot_pages_used, mo_Relaxed);
      const uint64_t reader_pages_retired =
          atomic_load64(&r->snapshot_pages_retired, mo_Relaxed);
      if (unlikely(txnid != safe64_read(&r->txnid) ||
                   pid != atomic_load32(&r->pid, mo_AcquireRelease) ||
                   tid != atomic_load64(&r->tid, mo_Relaxed) ||
                   pages_used !=
                       atomic_load32(&r->snapshot_pages_used, mo_Relaxed) ||
                   reader_pages_retired !=
                       atomic_load64(&r->snapshot_pages_retired, mo_Relaxed)))
        goto retry_reader;

      eASSERT(env, txnid > 0);
      /* A txnid above the threshold marks a slot mid-update: report as 0. */
      if (txnid >= SAFE64_INVALID_THRESHOLD)
        txnid = 0;

      size_t bytes_used = 0;
      size_t bytes_retained = 0;
      uint64_t lag = 0;
      if (txnid) {
        /* Sample the recent meta page consistently (same retry pattern). */
        troika_t troika = meta_tap(env);
      retry_header:;
        const meta_ptr_t head = meta_recent(env, &troika);
        const uint64_t head_pages_retired =
            unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired);
        if (unlikely(meta_should_retry(env, &troika) ||
                     head_pages_retired != unaligned_peek_u64_volatile(
                                               4, head.ptr_v->pages_retired)))
          goto retry_header;

        lag = (head.txnid - txnid) / xMDBX_TXNID_STEP;
        bytes_used = pgno2bytes(env, pages_used);
        /* Pages retired since this reader's snapshot are "retained" (kept
         * alive for it and unavailable for recycling). */
        bytes_retained = (head_pages_retired > reader_pages_retired)
                             ? pgno2bytes(env, (pgno_t)(head_pages_retired -
                                                        reader_pages_retired))
                             : 0;
      }
      rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)((intptr_t)tid),
                txnid, lag, bytes_used, bytes_retained);
      if (unlikely(rc != MDBX_SUCCESS))
        break;
    }
  }

  return rc;
}
/* Scans the reader lock table and clears slots left behind by dead
 * readers. When `dead` is provided it receives the number of stale
 * slots cleaned up (mvcc_cleanup_dead() fills it in). */
__cold int mdbx_reader_check(MDBX_env *env, int *dead) {
  if (dead != nullptr)
    *dead = 0;
  return mvcc_cleanup_dead(env, false, dead);
}
/*------------------------------------------------------------------------------
* Locking API */
/* Acquires the environment's exclusive write-transaction lock on behalf
 * of the caller. Fails with MDBX_EACCESS for read-only environments and
 * with MDBX_BUSY when a write transaction is already running. With
 * dont_wait the underlying lock attempt does not block. */
int mdbx_txn_lock(MDBX_env *env, bool dont_wait) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(env->flags & MDBX_RDONLY))
    return MDBX_EACCESS;
  /* refuse while the basal (write) transaction is owned or not finished */
  if (unlikely(env->basal_txn->owner ||
               (env->basal_txn->flags & MDBX_TXN_FINISHED) == 0))
    return MDBX_BUSY;
  return lck_txn_lock(env, dont_wait);
}
/* Releases the write-transaction lock acquired via mdbx_txn_lock().
 * Fails unless the calling thread currently owns the lock and no write
 * transaction is in progress. */
int mdbx_txn_unlock(MDBX_env *env) {
  const int err = check_env(env, true);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely((env->flags & MDBX_RDONLY) != 0))
    return MDBX_EACCESS;
  /* only the owning thread may unlock */
  if (unlikely(env->basal_txn->owner != osal_thread_self()))
    return MDBX_THREAD_MISMATCH;
  /* a not-finished basal transaction means the lock is in active use */
  if (unlikely((env->basal_txn->flags & MDBX_TXN_FINISHED) == 0))
    return MDBX_BUSY;
  lck_txn_unlock(env);
  return MDBX_SUCCESS;
}

225
src/api-key-transform.c Normal file
View File

@ -0,0 +1,225 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Maps a 64-bit ordered key back to the IEEE-754 double it encodes;
 * inverse of double2key(). Negative keys were produced from doubles with
 * a non-negative sign bit (remove the bias), non-negative keys from
 * negative doubles (undo the complement). */
static inline double key2double(const int64_t key) {
  union {
    uint64_t bits;
    double value;
  } punning;
  if (key < 0)
    punning.bits = (uint64_t)key + UINT64_C(0x8000000000000000);
  else
    punning.bits = UINT64_C(0xffffFFFFffffFFFF) - (uint64_t)key;
  return punning.value;
}
/* Converts an IEEE-754 double into a 64-bit unsigned key whose plain
 * unsigned ordering matches the numeric ordering of the source values.
 *
 * Fix: the previous code read the bits via `*(const int64_t *)ptr`,
 * which violates strict aliasing (undefined behavior); the bit pattern
 * is now obtained through a union, which yields the same value. */
static inline uint64_t double2key(const double *const ptr) {
  STATIC_ASSERT(sizeof(double) == sizeof(int64_t));
  union {
    double f;
    int64_t i;
  } casting;
  casting.f = *ptr;
  const int64_t i = casting.i;
  const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i
                             : i + UINT64_C(0x8000000000000000);
  if (ASSERT_ENABLED()) {
    /* self-check: the mapping must round-trip through key2double() */
    const double f = key2double(u);
    assert(memcmp(&f, ptr, sizeof(double)) == 0);
  }
  return u;
}
/* Maps a 32-bit ordered key back to the IEEE-754 float it encodes;
 * inverse of float2key() (same scheme as key2double, 32-bit wide). */
static inline float key2float(const int32_t key) {
  union {
    uint32_t bits;
    float value;
  } punning;
  if (key < 0)
    punning.bits = (uint32_t)key + UINT32_C(0x80000000);
  else
    punning.bits = UINT32_C(0xffffFFFF) - (uint32_t)key;
  return punning.value;
}
/* Converts an IEEE-754 float into a 32-bit unsigned key whose plain
 * unsigned ordering matches the numeric ordering of the source values.
 *
 * Fix: the previous code read the bits via `*(const int32_t *)ptr`,
 * which violates strict aliasing (undefined behavior); the bit pattern
 * is now obtained through a union, which yields the same value. */
static inline uint32_t float2key(const float *const ptr) {
  STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
  union {
    float f;
    int32_t i;
  } casting;
  casting.f = *ptr;
  const int32_t i = casting.i;
  const uint32_t u =
      (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000);
  if (ASSERT_ENABLED()) {
    /* self-check: the mapping must round-trip through key2float() */
    const float f = key2float(u);
    assert(memcmp(&f, ptr, sizeof(float)) == 0);
  }
  return u;
}
/* Public helpers: encode IEEE-754 values as unsigned keys whose plain
 * integer ordering matches the numeric ordering of the source values
 * (by-value and by-pointer variants delegate to the same converters). */
uint64_t mdbx_key_from_double(const double ieee754_64bit) {
  return double2key(&ieee754_64bit);
}
uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) {
  return double2key(ieee754_64bit);
}
uint32_t mdbx_key_from_float(const float ieee754_32bit) {
  return float2key(&ieee754_32bit);
}
uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) {
  return float2key(ieee754_32bit);
}
/* IEEE-754 binary64 layout: 52 explicit mantissa bits, 11-bit exponent
 * with bias 0x3FF (all-ones 0x7FF is inf/NaN), plus the implicit leading
 * mantissa bit. AMAX is the largest mantissa including that lead bit. */
#define IEEE754_DOUBLE_MANTISSA_SIZE 52
#define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF
#define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF
#define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000)
#define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF)
#define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF)
/* Counts leading zero bits of a 64-bit value, preferring compiler
 * builtins/intrinsics and falling back to a portable de Bruijn sequence.
 * NOTE(review): the portable fallback assumes value != 0 — confirm all
 * callers guarantee this (clz of 0 is undefined for the builtins too). */
static inline int clz64(uint64_t value) {
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl)
  if (sizeof(value) == sizeof(int))
    return __builtin_clz(value);
  if (sizeof(value) == sizeof(long))
    return __builtin_clzl(value);
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) ||            \
    __has_builtin(__builtin_clzll)
  return __builtin_clzll(value);
#endif /* have(long long) && long long == uint64_t */
#endif /* GNU C */
#if defined(_MSC_VER)
  unsigned long index;
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
  _BitScanReverse64(&index, value);
  return 63 - index;
#else
  /* 32-bit MSVC: scan the high word first, then the low word */
  if (value > UINT32_MAX) {
    _BitScanReverse(&index, (uint32_t)(value >> 32));
    return 31 - index;
  }
  _BitScanReverse(&index, (uint32_t)value);
  return 63 - index;
#endif
#endif /* MSVC */
  /* portable fallback: smear the top bit rightward, then use a de Bruijn
   * multiply to locate the highest set bit */
  value |= value >> 1;
  value |= value >> 2;
  value |= value >> 4;
  value |= value >> 8;
  value |= value >> 16;
  value |= value >> 32;
  static const uint8_t debruijn_clz64[64] = {
      63, 16, 62, 7,  15, 36, 61, 3,  6,  14, 22, 26, 35, 47, 60, 2,
      9,  5,  28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1,
      17, 8,  37, 4,  23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18,
      38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0};
  return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58];
}
/* Right-shifts u64 by -shift bits while rounding to nearest with
 * ties-to-even (the IEEE-754 default rounding rule).
 * Requires a negative shift and a non-zero u64. */
static inline uint64_t round_mantissa(const uint64_t u64, int shift) {
  assert(shift < 0 && u64 > 0);
  const int width = -shift;
  const unsigned midpoint = 1u << (width - 1);
  /* on an exact tie, round toward the value whose lowest kept bit is
   * zero ("ties to even") */
  const unsigned kept_lsb = (unsigned)(u64 >> width) & 1u;
  const unsigned even_adjust = kept_lsb ^ 1u;
  return (u64 + midpoint - even_adjust) >> width;
}
/* Encodes a 64-bit integer as the ordered key of its nearest IEEE-754
 * double (round-to-nearest-even), constructed with integer arithmetic
 * only, so it matches mdbx_key_from_double((double)json_integer). */
uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) {
  const uint64_t bias = UINT64_C(0x8000000000000000);
  if (json_integer > 0) {
    const uint64_t u64 = json_integer;
    /* left-justify into the 53-bit mantissa; a negative shift means the
     * value has more than 53 significant bits and must be rounded */
    int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
    uint64_t mantissa = u64 << shift;
    if (unlikely(shift < 0)) {
      mantissa = round_mantissa(u64, shift);
      if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
        /* rounding carried out of the mantissa: shift once more */
        mantissa = round_mantissa(u64, --shift);
    }
    assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
           mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
    const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS +
                              IEEE754_DOUBLE_MANTISSA_SIZE - shift;
    assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
    /* positive doubles map above the bias, ordered ascending */
    const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) +
                         (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
#if !defined(_MSC_VER) ||                                                      \
    defined(                                                                   \
        _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
                   symbol __except1 referenced in function __ftol3_except */
    assert(key == mdbx_key_from_double((double)json_integer));
#endif /* Workaround for MSVC */
    return key;
  }
  if (json_integer < 0) {
    /* mirror of the positive branch for the magnitude of the value;
     * negative doubles map below the bias, ordered ascending */
    const uint64_t u64 = -json_integer;
    int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
    uint64_t mantissa = u64 << shift;
    if (unlikely(shift < 0)) {
      mantissa = round_mantissa(u64, shift);
      if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
        mantissa = round_mantissa(u64, --shift);
    }
    assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
           mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
    const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS +
                              IEEE754_DOUBLE_MANTISSA_SIZE - shift;
    assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
    const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) -
                         (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
#if !defined(_MSC_VER) ||                                                      \
    defined(                                                                   \
        _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
                   symbol __except1 referenced in function __ftol3_except */
    assert(key == mdbx_key_from_double((double)json_integer));
#endif /* Workaround for MSVC */
    return key;
  }
  /* zero maps exactly to the bias */
  return bias;
}
/* Decodes an ordered 8-byte key (as produced by mdbx_key_from_double /
 * mdbx_key_from_jsonInteger) back to an integer, truncating toward zero
 * and saturating to INT64_MIN/INT64_MAX for out-of-range magnitudes. */
int64_t mdbx_jsonInteger_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  const uint64_t key = unaligned_peek_u64(2, v.iov_base);
  const uint64_t bias = UINT64_C(0x8000000000000000);
  /* fold both sign halves of the key space into one magnitude */
  const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1;
  const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 -
                    (IEEE754_DOUBLE_EXPONENTA_MAX &
                     (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE));
  if (unlikely(shift < 1))
    /* magnitude exceeds the 64-bit integer range: saturate */
    return (key < bias) ? INT64_MIN : INT64_MAX;
  if (unlikely(shift > 63))
    /* magnitude below 1: truncates to zero */
    return 0;
  /* restore the mantissa (with implicit lead bit) left-justified,
   * then shift down by the decoded exponent */
  const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK)
                             << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) +
                            bias;
  const int64_t absolute = unscaled >> shift;
  const int64_t value = (key < bias) ? -absolute : absolute;
  assert(key == mdbx_key_from_jsonInteger(value) ||
         (mdbx_key_from_jsonInteger(value - 1) < key &&
          key < mdbx_key_from_jsonInteger(value + 1)));
  return value;
}
/* Public decoders: recover the original value from an ordered key.
 * Each expects a key of the exact width it decodes (asserted). */
double mdbx_double_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  return key2double(unaligned_peek_u64(2, v.iov_base));
}
float mdbx_float_from_key(const MDBX_val v) {
  assert(v.iov_len == 4);
  return key2float(unaligned_peek_u32(2, v.iov_base));
}
int32_t mdbx_int32_from_key(const MDBX_val v) {
  assert(v.iov_len == 4);
  /* signed integers are stored biased by 2^31 for unsigned ordering */
  return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000));
}
int64_t mdbx_int64_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  /* signed integers are stored biased by 2^63 for unsigned ordering */
  return (int64_t)(unaligned_peek_u64(2, v.iov_base) -
                   UINT64_C(0x8000000000000000));
}

508
src/api-txn.c Normal file
View File

@ -0,0 +1,508 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
#ifdef __SANITIZE_THREAD__
/* LY: avoid tsan-trap by txn, mm_last_pg and geo.first_unallocated */
__attribute__((__no_sanitize_thread__, __noinline__))
#endif
/* Returns how many transactions the given read-only snapshot lags behind
 * the most recent committed one (0 for a write transaction), and
 * optionally the database fill percentage via *percent. Negative values
 * are error codes (positive codes are negated to disambiguate). */
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
{
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return (rc > 0) ? -rc : rc;
  MDBX_env *env = txn->env;
  if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) {
    /* a write transaction never lags; report fill of its own geometry */
    if (percent)
      *percent = (int)((txn->geo.first_unallocated * UINT64_C(100) +
                        txn->geo.end_pgno / 2) /
                       txn->geo.end_pgno);
    return 0;
  }
  txnid_t lag;
  troika_t troika = meta_tap(env);
  do {
    /* sample the recent meta; loop until the sample is stable */
    const meta_ptr_t head = meta_recent(env, &troika);
    if (percent) {
      const pgno_t maxpg = head.ptr_v->geometry.now;
      *percent = (int)((head.ptr_v->geometry.first_unallocated * UINT64_C(100) +
                        maxpg / 2) /
                       maxpg);
    }
    lag = (head.txnid - txn->txnid) / xMDBX_TXNID_STEP;
  } while (unlikely(meta_should_retry(env, &troika)));
  return (lag > INT_MAX) ? INT_MAX : (int)lag;
}
/* Builds in *mask a bitmap of the duplicate-storage depths present in a
 * MDBX_DUPSORT table: bit 0 for plain single-value entries, bit 1 for
 * inline sub-pages, bit H for nested sub-trees of height H. Returns
 * MDBX_RESULT_TRUE when the table is not MDBX_DUPSORT. */
__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi,
                                      uint32_t *mask) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!mask))
    return MDBX_EINVAL;
  cursor_couple_t cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if ((cx.outer.tree->flags & MDBX_DUPSORT) == 0)
    return MDBX_RESULT_TRUE;
  MDBX_val key, data;
  /* walk distinct keys only (MDBX_NEXT_NODUP), inspecting each node */
  rc = outer_first(&cx.outer, &key, &data);
  *mask = 0;
  while (rc == MDBX_SUCCESS) {
    const node_t *node =
        page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
    const tree_t *db = node_data(node);
    const unsigned flags = node_flags(node);
    switch (flags) {
    case N_BIGDATA:
    case 0:
      /* single-value entry, deep = 0 */
      *mask |= 1 << 0;
      break;
    case N_DUPDATA:
      /* single sub-page, deep = 1 */
      *mask |= 1 << 1;
      break;
    case N_DUPDATA | N_SUBDATA:
      /* sub-tree */
      *mask |= 1 << UNALIGNED_PEEK_16(db, tree_t, height);
      break;
    default:
      ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
            "invalid node-size", flags);
      return MDBX_CORRUPTED;
    }
    rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP);
  }
  /* running off the end of the table is the normal termination */
  return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
}
/* Copies the transaction's canary (the user-maintained x/y/z/v
 * quadruple) into *canary. */
int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) {
  const int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(canary == nullptr))
    return MDBX_EINVAL;
  *canary = txn->canary;
  return MDBX_SUCCESS;
}
/* Fetches into *data the (first) value stored under *key in table
 * `dbi`, using a stack-allocated cursor positioned via MDBX_SET. */
int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
             MDBX_val *data) {
  DKBUF_DEBUG;
  DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));
  const int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(key == nullptr || data == nullptr))
    return MDBX_EINVAL;
  cursor_couple_t couple;
  const int setup = cursor_init(&couple.outer, txn, dbi);
  if (unlikely(setup != MDBX_SUCCESS))
    return setup;
  return cursor_seek(&couple.outer, (MDBX_val *)key, data, MDBX_SET).err;
}
/* Positions at the first key/value pair equal to or greater than *key
 * (MDBX_SET_LOWERBOUND); on success both key and data are updated to
 * the found pair. */
int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
                            MDBX_val *data) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!key || !data))
    return MDBX_EINVAL;
  /* NOTE(review): check_txn() above was already asked to reject
   * MDBX_TXN_BLOCKED, so this re-check looks redundant — confirm
   * before removing */
  if (unlikely(txn->flags & MDBX_TXN_BLOCKED))
    return MDBX_BAD_TXN;
  cursor_couple_t cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  return cursor_ops(&cx.outer, key, data, MDBX_SET_LOWERBOUND);
}
/* Like mdbx_get(), but positions via MDBX_SET_KEY (so *key is updated to
 * the stored key) and optionally reports the number of duplicate values
 * for that key via *values_count. */
int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
                MDBX_val *data, size_t *values_count) {
  DKBUF_DEBUG;
  DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!key || !data))
    return MDBX_EINVAL;
  cursor_couple_t cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  rc = cursor_seek(&cx.outer, key, data, MDBX_SET_KEY).err;
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* not found (or error): report zero duplicates */
    if (values_count)
      *values_count = 0;
    return rc;
  }
  if (values_count) {
    *values_count = 1;
    if (inner_pointed(&cx.outer))
      /* clamp the stored duplicate counter to PTRDIFF_MAX when size_t
       * is narrower than the counter's type */
      *values_count =
          (sizeof(*values_count) >= sizeof(cx.inner.nested_tree.items) ||
           cx.inner.nested_tree.items <= PTRDIFF_MAX)
              ? (size_t)cx.inner.nested_tree.items
              : PTRDIFF_MAX;
  }
  return MDBX_SUCCESS;
}
/*----------------------------------------------------------------------------*/
/* Updates the transaction's canary. When a non-null canary is given its
 * x/y/z fields are stored; in every case that actually changes state the
 * v field is stamped with the current txnid and the transaction is
 * marked dirty. */
int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) {
  const int err = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (likely(canary != nullptr)) {
    const bool unchanged = txn->canary.x == canary->x &&
                           txn->canary.y == canary->y &&
                           txn->canary.z == canary->z;
    if (unchanged)
      /* nothing to update: neither bump .v nor dirty the txn */
      return MDBX_SUCCESS;
    txn->canary.x = canary->x;
    txn->canary.y = canary->y;
    txn->canary.z = canary->z;
  }
  txn->canary.v = txn->txnid;
  txn->flags |= MDBX_TXN_DIRTY;
  return MDBX_SUCCESS;
}
/* Reports whether the given address lies within a "dirty" page of the
 * specified write transaction. Ultimately this allows callers to avoid
 * needless copying of data from non-dirty pages.
 *
 * "Dirty" pages are those already modified during the write transaction,
 * so any further change may overwrite them. Hence functions performing
 * modifications must NOT receive pointers into such pages as arguments;
 * non-dirty pages, in turn, are copied before being modified.
 *
 * In other words, data residing in dirty pages must either be copied
 * before being passed as arguments for further modifications, or be
 * rejected at the argument-validation stage. Thus this function enables
 * both avoiding extra copying and performing a fuller argument check.
 *
 * IMPORTANT: the supplied pointer must reference the beginning of the
 * data. Only then is the actual page header guaranteed to be physically
 * located in the same memory page, including for multi-page P_LARGE
 * records with long data. */
int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  const MDBX_env *env = txn->env;
  const ptrdiff_t offset = ptr_dist(ptr, env->dxb_mmap.base);
  if (offset >= 0) {
    const pgno_t pgno = bytes2pgno(env, offset);
    if (likely(pgno < txn->geo.first_unallocated)) {
      const page_t *page = pgno2page(env, pgno);
      if (unlikely(page->pgno != pgno || (page->flags & P_ILL_BITS) != 0)) {
        /* The ptr pointed into middle of a large page,
         * not to the beginning of a data. */
        return MDBX_EINVAL;
      }
      return ((txn->flags & MDBX_TXN_RDONLY) || !is_modifable(txn, page))
                 ? MDBX_RESULT_FALSE
                 : MDBX_RESULT_TRUE;
    }
    if ((size_t)offset < env->dxb_mmap.limit) {
      /* The pointer addresses something within the mmap, but beyond the
       * allocated pages. This can happen when mdbx_is_dirty() is called
       * after an operation during which a dirty page was returned to the
       * unallocated space. */
      return (txn->flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_RESULT_TRUE;
    }
  }
  /* The page is outside the used mmap range: either an invalid address
   * was passed in, or the address is within a shadow page that was
   * allocated via malloc().
   *
   * For the MDBX_WRITE_MAP mode the page is definitely "not dirty",
   * while for modes without MDBX_WRITE_MAP it is definitely "not clean". */
  return (txn->flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? MDBX_EINVAL
                                                          : MDBX_RESULT_TRUE;
}
/* Deletes items matching *key from table `dbi`. Without a data argument
 * every duplicate for the key is removed (MDBX_ALLDUPS); with one, only
 * the exactly matching key/value pair is removed. */
int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
             const MDBX_val *data) {
  int err = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(key == nullptr))
    return MDBX_EINVAL;
  if (unlikely(dbi <= FREE_DBI))
    return MDBX_BAD_DBI;
  if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
    return (txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;

  cursor_couple_t couple;
  err = cursor_init(&couple.outer, txn, dbi);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* choose the seek/delete mode from the presence of a data argument */
  MDBX_val holder;
  MDBX_cursor_op seek_op = MDBX_SET;
  unsigned del_flags = MDBX_ALLDUPS;
  if (data != nullptr) {
    holder = *data;
    data = &holder;
    seek_op = MDBX_GET_BOTH;
    del_flags = 0;
  }
  err = cursor_seek(&couple.outer, (MDBX_val *)key, (MDBX_val *)data, seek_op)
            .err;
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* temporarily link the stack cursor into the txn's tracking list so
   * sibling cursors get adjusted by the deletion */
  couple.outer.next = txn->cursors[dbi];
  txn->cursors[dbi] = &couple.outer;
  err = cursor_del(&couple.outer, del_flags);
  txn->cursors[dbi] = couple.outer.next;
  return err;
}
/* Stores the key/data pair into table `dbi` according to `flags`
 * (overwrite/append/reserve/etc.), using a stack-allocated cursor that
 * is temporarily linked into the transaction's cursor-tracking list.
 *
 * Fix: MDBX_ALLDUPS was listed twice in the flags validity mask; the
 * duplicate term is removed (the accepted flag set is unchanged). */
int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
             MDBX_put_flags_t flags) {
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!key || !data))
    return MDBX_EINVAL;
  if (unlikely(dbi <= FREE_DBI))
    return MDBX_BAD_DBI;
  if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
                         MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP |
                         MDBX_CURRENT | MDBX_MULTIPLE)))
    return MDBX_EINVAL;
  if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
    return (txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;
  cursor_couple_t cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  cx.outer.next = txn->cursors[dbi];
  txn->cursors[dbi] = &cx.outer;
  /* LY: support for update (explicit overwrite) */
  if (flags & MDBX_CURRENT) {
    rc = cursor_seek(&cx.outer, (MDBX_val *)key, nullptr, MDBX_SET).err;
    if (likely(rc == MDBX_SUCCESS) && (txn->dbs[dbi].flags & MDBX_DUPSORT) &&
        (flags & MDBX_ALLDUPS) == 0) {
      /* LY: allows update (explicit overwrite) only for unique keys */
      node_t *node =
          page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
      if (node_flags(node) & N_DUPDATA) {
        tASSERT(txn, inner_pointed(&cx.outer) &&
                         cx.outer.subcur->nested_tree.items > 1);
        rc = MDBX_EMULTIVAL;
        if ((flags & MDBX_NOOVERWRITE) == 0) {
          /* replace the whole multi-value entry */
          flags -= MDBX_CURRENT;
          rc = cursor_del(&cx.outer, MDBX_ALLDUPS);
        }
      }
    }
  }
  if (likely(rc == MDBX_SUCCESS))
    rc = cursor_put_checklen(&cx.outer, key, data, flags);
  txn->cursors[dbi] = cx.outer.next;
  return rc;
}
//------------------------------------------------------------------------------
/* Allows updating or deleting an existing record while obtaining the
 * previous data value via old_data. If new_data is null the record is
 * deleted, otherwise it is updated/inserted.
 *
 * The current value may reside in an already modified (dirty) page, in
 * which case the page would be overwritten during the update and the old
 * value lost. Therefore old_data must initially provide an auxiliary
 * buffer for copying out the old value. If the supplied buffer is too
 * small, the preserver reports failure with old_data->iov_len set to the
 * required size.
 *
 * For non-unique keys a second usage scenario is possible, where
 * old_data selects the particular record (among those with the same key)
 * to delete/update. To request it, pass MDBX_CURRENT and
 * MDBX_NOOVERWRITE together in flags; this combination is otherwise
 * meaningless, which makes it a safe marker for this scenario.
 *
 * The function could be replaced with corresponding cursor operations
 * after two improvements (TODO):
 *  - external allocation of cursors, including on the stack (no malloc);
 *  - obtaining a page's dirty status by address (MUTABLE/WRITEABLE).
 *
 * Fix: the alias check between old_data and new_data used to `return`
 * directly after the stack cursor had been linked into txn->cursors[dbi],
 * leaving a dangling pointer in the tracking list; it now exits through
 * `bailout` so the cursor is unlinked. */
int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
                    MDBX_val *new_data, MDBX_val *old_data,
                    MDBX_put_flags_t flags, MDBX_preserve_func preserver,
                    void *preserver_context) {
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!key || !old_data || old_data == new_data))
    return MDBX_EINVAL;
  if (unlikely(old_data->iov_base == nullptr && old_data->iov_len))
    return MDBX_EINVAL;
  if (unlikely(new_data == nullptr &&
               (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT))
    return MDBX_EINVAL;
  if (unlikely(dbi <= FREE_DBI))
    return MDBX_BAD_DBI;
  if (unlikely(flags &
               ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
                 MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT)))
    return MDBX_EINVAL;
  cursor_couple_t cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  cx.outer.next = txn->cursors[dbi];
  txn->cursors[dbi] = &cx.outer;
  MDBX_val present_key = *key;
  if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) {
    /* old_data holds the value selecting the particular duplicate */
    if (unlikely(!(txn->dbs[dbi].flags & MDBX_DUPSORT))) {
      rc = MDBX_EINVAL;
      goto bailout;
    }
    /* clear the extra bit, it only marked the requested mode */
    flags -= MDBX_NOOVERWRITE;
    rc = cursor_seek(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err;
    if (rc != MDBX_SUCCESS)
      goto bailout;
  } else {
    /* old_data is a buffer for saving the previous value */
    if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) {
      /* must unlink the tracked cursor before returning */
      rc = MDBX_EINVAL;
      goto bailout;
    }
    MDBX_val present_data;
    rc = cursor_seek(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err;
    if (unlikely(rc != MDBX_SUCCESS)) {
      old_data->iov_base = nullptr;
      old_data->iov_len = 0;
      if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT))
        goto bailout;
    } else if (flags & MDBX_NOOVERWRITE) {
      rc = MDBX_KEYEXIST;
      *old_data = present_data;
      goto bailout;
    } else {
      page_t *page = cx.outer.pg[cx.outer.top];
      if (txn->dbs[dbi].flags & MDBX_DUPSORT) {
        if (flags & MDBX_CURRENT) {
          /* disallow update/delete for multi-values */
          node_t *node = page_node(page, cx.outer.ki[cx.outer.top]);
          if (node_flags(node) & N_DUPDATA) {
            tASSERT(txn, inner_pointed(&cx.outer) &&
                             cx.outer.subcur->nested_tree.items > 1);
            if (cx.outer.subcur->nested_tree.items > 1) {
              rc = MDBX_EMULTIVAL;
              goto bailout;
            }
          }
          /* In LMDB the MDBX_CURRENT flag here would replace the data
           * ignoring the MDBX_DUPSORT ordering, but here this is
           * acceptable in any case, since we have verified that the
           * key has exactly one value. */
        }
      }
      if (is_modifable(txn, page)) {
        if (new_data && cmp_lenfast(&present_data, new_data) == 0) {
          /* data matches, nothing to do */
          *old_data = *new_data;
          goto bailout;
        }
        /* the old value lives in a dirty page: copy it out first */
        rc = preserver ? preserver(preserver_context, old_data,
                                   present_data.iov_base, present_data.iov_len)
                       : MDBX_SUCCESS;
        if (unlikely(rc != MDBX_SUCCESS))
          goto bailout;
      } else {
        *old_data = present_data;
      }
      flags |= MDBX_CURRENT;
    }
  }
  if (likely(new_data))
    rc = cursor_put_checklen(&cx.outer, key, new_data, flags);
  else
    rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS);
bailout:
  txn->cursors[dbi] = cx.outer.next;
  return rc;
}
/* Default preserver for mdbx_replace(): copies the old value into the
 * caller-supplied buffer, or reports the required size via iov_len and
 * returns MDBX_RESULT_TRUE when the buffer is too small. */
static int default_value_preserver(void *context, MDBX_val *target,
                                   const void *src, size_t bytes) {
  (void)context;
  if (unlikely(target->iov_len < bytes)) {
    /* too small: leave no pointer, report the needed length */
    target->iov_base = nullptr;
    target->iov_len = bytes;
    return MDBX_RESULT_TRUE;
  }
  target->iov_len = bytes;
  memcpy(target->iov_base, src, bytes);
  return MDBX_SUCCESS;
}
/* Convenience wrapper over mdbx_replace_ex() using the default
 * old-value preserver (copy into the caller-provided buffer). */
int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
                 MDBX_val *new_data, MDBX_val *old_data,
                 MDBX_put_flags_t flags) {
  return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags,
                         default_value_preserver, nullptr);
}

390
src/atomics-ops.h Normal file
View File

@ -0,0 +1,390 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
#ifndef __cplusplus

#ifdef MDBX_HAVE_C11ATOMICS
/* C11 atomics available: issue a real fence with the mapped order. */
#define osal_memory_fence(order, write)                                        \
  atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order))
#else /* MDBX_HAVE_C11ATOMICS */
/* No C11 atomics: always a compiler barrier, plus a hardware barrier for
 * stores when the CPU has incoherent write-back or the requested order
 * is stronger than acquire-release. */
#define osal_memory_fence(order, write)                                        \
  do {                                                                         \
    osal_compiler_barrier();                                                   \
    if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed           \
                                                        : mo_AcquireRelease))  \
      osal_memory_barrier();                                                   \
  } while (0)
#endif /* MDBX_HAVE_C11ATOMICS */

#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__)
/* Elbrus LCC: provide the 32/64-bit store/load helpers as
 * statement-expression macros wrapping the C11 primitives. */
#define atomic_store32(p, value, order)                                        \
  ({                                                                           \
    const uint32_t value_to_store = (value);                                   \
    atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store,           \
                          mo_c11_store(order));                                \
    value_to_store;                                                            \
  })
#define atomic_load32(p, order)                                                \
  atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order))
#define atomic_store64(p, value, order)                                        \
  ({                                                                           \
    const uint64_t value_to_store = (value);                                   \
    atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store,           \
                          mo_c11_store(order));                                \
    value_to_store;                                                            \
  })
#define atomic_load64(p, order)                                                \
  atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order))
#endif /* LCC && MDBX_HAVE_C11ATOMICS */
#ifndef atomic_store32
/* Stores `value` into *p with the requested memory-order semantics and
 * returns the stored value. Without C11 atomics the store is framed by a
 * compiler barrier and a memory fence. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_store32(mdbx_atomic_uint32_t *p, const uint32_t value,
               enum mdbx_memory_order order) {
  STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order));
#else /* MDBX_HAVE_C11ATOMICS */
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  p->weak = value;
  osal_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
  return value;
}
#endif /* atomic_store32 */
#ifndef atomic_load32
/* Loads the value of *p with the requested memory-order semantics.
 * Without C11 atomics the load is framed by a memory fence and a
 * compiler barrier. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
    const volatile mdbx_atomic_uint32_t *p, enum mdbx_memory_order order) {
  STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order));
#else /* MDBX_HAVE_C11ATOMICS */
  osal_memory_fence(order, false);
  const uint32_t value = p->weak;
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
}
#endif /* atomic_load32 */
/*------------------------------------------------------------------------------
 * safe read/write volatile 64-bit fields on 32-bit architectures. */

/* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
 * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
#ifndef xMDBX_TXNID_STEP
#if MDBX_64BIT_CAS
/* 64-bit CAS available: txnid advances by 1 */
#define xMDBX_TXNID_STEP 1u
#else
/* no 64-bit CAS: step by 2 (see safe64_txnid_next / safe64_reset) */
#define xMDBX_TXNID_STEP 2u
#endif
#endif /* xMDBX_TXNID_STEP */
#ifndef atomic_store64
/* Stores a 64-bit value into *p and returns it. On targets without
 * native 64-bit atomics the value is written as two 32-bit halves
 * (low first, then high with the requested order). */
MDBX_MAYBE_UNUSED static __always_inline uint64_t
atomic_store64(mdbx_atomic_uint64_t *p, const uint64_t value,
               enum mdbx_memory_order order) {
  STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#if __GNUC_PREREQ(11, 0)
  STATIC_ASSERT(__alignof__(mdbx_atomic_uint64_t) >= sizeof(uint64_t));
#endif /* GNU C >= 11 */
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order));
#else /* MDBX_HAVE_C11ATOMICS */
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  p->weak = value;
  osal_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
#else /* !MDBX_64BIT_ATOMIC */
  osal_compiler_barrier();
  atomic_store32(&p->low, (uint32_t)value, mo_Relaxed);
  jitter4testing(true);
  atomic_store32(&p->high, (uint32_t)(value >> 32), order);
  jitter4testing(true);
#endif /* !MDBX_64BIT_ATOMIC */
  return value;
}
#endif /* atomic_store64 */
#ifndef atomic_load64
/* Loads a 64-bit value from *p. On targets without native 64-bit
 * atomics the two 32-bit halves are re-read in a loop until two
 * consecutive samples agree, yielding a torn-free value. */
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
    __always_inline
#endif /* MDBX_64BIT_ATOMIC */
    uint64_t
    atomic_load64(const volatile mdbx_atomic_uint64_t *p,
                  enum mdbx_memory_order order) {
  STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order));
#else /* MDBX_HAVE_C11ATOMICS */
  osal_memory_fence(order, false);
  const uint64_t value = p->weak;
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
#else /* !MDBX_64BIT_ATOMIC */
  osal_compiler_barrier();
  uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32;
  jitter4testing(true);
  value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
                                                        : mo_AcquireRelease);
  jitter4testing(true);
  for (;;) {
    osal_compiler_barrier();
    uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32;
    jitter4testing(true);
    again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
                                                          : mo_AcquireRelease);
    jitter4testing(true);
    /* stable only when two consecutive samples match */
    if (likely(value == again))
      return value;
    value = again;
  }
#endif /* !MDBX_64BIT_ATOMIC */
}
#endif /* atomic_load64 */
/* Hints the CPU that the caller is spin-waiting (pause/yield instruction
 * where available), falling back to a scheduler yield on platforms
 * without such an instruction. */
MDBX_MAYBE_UNUSED static __always_inline void atomic_yield(void) {
#if defined(_WIN32) || defined(_WIN64)
  YieldProcessor();
#elif defined(__ia32__) || defined(__e2k__)
  __builtin_ia32_pause();
#elif defined(__ia64__)
#if defined(__HP_cc__) || defined(__HP_aCC__)
  _Asm_hint(_HINT_PAUSE);
#else
  __asm__ __volatile__("hint @pause");
#endif
#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) ||       \
    defined(__ARM_ARCH_6K__)
#ifdef __CC_ARM
  __yield();
#else
  __asm__ __volatile__("yield");
#endif
#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \
    __mips_isa_rev >= 2
  __asm__ __volatile__("pause");
#elif defined(__mips) || defined(__mips__) || defined(__mips64) ||             \
    defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
    defined(__MWERKS__) || defined(__sgi)
  /* encoding of the MIPS "pause"-like hint for pre-R2 toolchains */
  __asm__ __volatile__(".word 0x00000140");
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
  sched_yield();
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
  pthread_yield();
#endif
}
#if MDBX_64BIT_CAS
/* 64-bit compare-and-swap: atomically replaces *p with v when it equals
 * c; returns true on success. */
MDBX_MAYBE_UNUSED static __always_inline bool
atomic_cas64(mdbx_atomic_uint64_t *p, uint64_t c, uint64_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t));
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
  return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
  return c == (uint64_t)_InterlockedCompareExchange64(
                  (volatile __int64 *)&p->weak, v, c);
#elif defined(__APPLE__)
  return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
#endif /* MDBX_64BIT_CAS */
/* 32-bit compare-and-swap: atomically replaces *p with v when it equals
 * c; returns true on success. */
MDBX_MAYBE_UNUSED static __always_inline bool
atomic_cas32(mdbx_atomic_uint32_t *p, uint32_t c, uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
  return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
  return c ==
         (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c);
#elif defined(__APPLE__)
  return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
/* Atomic 32-bit fetch-and-add: *p += v.
 * NOTE(review): the C11/GCC/MSVC branches return the PREVIOUS value
 * (fetch_add semantics), while OSAtomicAdd32Barrier returns the NEW
 * value — on Apple without C11 atomics the result differs; confirm
 * callers only rely on the side effect or on non-Apple semantics. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_add32(mdbx_atomic_uint32_t *p, uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
  return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_fetch_and_add(&p->weak, v);
#elif defined(_MSC_VER)
  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
  return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v);
#elif defined(__APPLE__)
  return OSAtomicAdd32Barrier(v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
#define atomic_sub32(p, v) atomic_add32(p, 0 - (v))
/* Advance a transaction id by the fixed step used for reader slots. */
MDBX_MAYBE_UNUSED static __always_inline uint64_t
safe64_txnid_next(uint64_t txnid) {
  const uint64_t next = txnid + xMDBX_TXNID_STEP;
#if !MDBX_64BIT_CAS
  /* skip values whose low 32 bits are all-ones, so that the low-part
   * increment performed by safe64_reset() cannot wrap around */
  return next + ((uint32_t)next == UINT32_MAX);
#else
  return next;
#endif
}
/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */
MDBX_MAYBE_UNUSED static __always_inline void
safe64_reset(mdbx_atomic_uint64_t *p, bool single_writer) {
  if (single_writer) {
#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64
    atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#else
    /* with a single writer, setting the high half to all-ones is enough
     * to push the value above SAFE64_INVALID_THRESHOLD */
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */
  } else {
#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC
    /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */
    atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#elif MDBX_64BIT_CAS
    /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#else
    /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1
     * and overflow was preserved in safe64_txnid_next() */
    STATIC_ASSERT(xMDBX_TXNID_STEP > 1);
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */
  }
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
  jitter4testing(true);
}
/* Invalidate *p only if it still equals `compare`.
 * Returns true when the reset was performed. */
MDBX_MAYBE_UNUSED static __always_inline bool
safe64_reset_compare(mdbx_atomic_uint64_t *p, uint64_t compare) {
  /* LY: This function is used to reset `txnid` from hsr-handler in case
   * the asynchronously cancellation of read transaction. Therefore,
   * there may be a collision between the cleanup performed here and
   * asynchronous termination and restarting of the read transaction
   * in another process/thread. In general we MUST NOT reset the `txnid`
   * if a new transaction was started (i.e. if `txnid` was changed). */
#if MDBX_64BIT_CAS
  bool rc = atomic_cas64(p, compare, UINT64_MAX);
#else
  /* LY: There is no gold ratio here since shared mutex is too costly,
   * in such way we must acquire/release it for every update of txnid,
   * i.e. twice for each read transaction). */
  bool rc = false;
  /* check low half, CAS the high half, then re-check the low half:
   * if it changed in between, roll the high half back */
  if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare &&
             atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) {
    if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) !=
                 (uint32_t)compare))
      atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32));
    else
      rc = true;
  }
#endif /* MDBX_64BIT_CAS */
  jitter4testing(true);
  return rc;
}
/* Publish a new valid value into a cell that was previously invalidated
 * by safe64_reset(); readers never observe a torn intermediate. */
MDBX_MAYBE_UNUSED static __always_inline void
safe64_write(mdbx_atomic_uint64_t *p, const uint64_t v) {
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS
  atomic_store64(p, v, mo_AcquireRelease);
#else /* MDBX_64BIT_ATOMIC */
  osal_compiler_barrier();
  /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */
  atomic_store32(&p->low, (uint32_t)v, mo_Relaxed);
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
  jitter4testing(true);
  /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */
  atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC */
  assert(p->weak == v);
  jitter4testing(true);
}
/* Read a 64-bit value that may be updated half-by-half on 32-bit hosts. */
MDBX_MAYBE_UNUSED static __always_inline uint64_t
safe64_read(const mdbx_atomic_uint64_t *p) {
  jitter4testing(true);
  uint64_t v;
  /* without native 64-bit atomics the load may observe a torn value while
   * a writer updates the halves; re-read until the snapshot is stable */
  do
    v = atomic_load64(p, mo_AcquireRelease);
  while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak));
  return v;
}
#if 0 /* unused for now */
/* True iff the value is below the invalid threshold
 * (i.e. its high 32 bits are not all-ones). */
MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) {
#if MDBX_WORDBITS >= 64
  return v < SAFE64_INVALID_THRESHOLD;
#else
  return (v >> 32) != UINT32_MAX;
#endif /* MDBX_WORDBITS */
}

/* Same validity check performed directly on the atomic cell. */
MDBX_MAYBE_UNUSED static __always_inline bool
safe64_is_valid_ptr(const mdbx_atomic_uint64_t *p) {
#if MDBX_64BIT_ATOMIC
  return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD;
#else
  return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX;
#endif /* MDBX_64BIT_ATOMIC */
}
#endif /* unused for now */
/* non-atomic write with safety for reading a half-updated value */
MDBX_MAYBE_UNUSED static __always_inline void
safe64_update(mdbx_atomic_uint64_t *p, const uint64_t v) {
#if MDBX_64BIT_ATOMIC
  atomic_store64(p, v, mo_Relaxed);
#else
  /* invalidate first so concurrent readers never see a torn value */
  safe64_reset(p, true);
  safe64_write(p, v);
#endif /* MDBX_64BIT_ATOMIC */
}
/* non-atomic increment with safety for reading a half-updated value */
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
    __always_inline
#endif /* MDBX_64BIT_ATOMIC, inlining is worthwhile only for the 1-store path */
    void
    safe64_inc(mdbx_atomic_uint64_t *p, const uint64_t v) {
  assert(v > 0);
  safe64_update(p, safe64_read(p) + v);
}
#endif /* !__cplusplus */

99
src/atomics-types.h Normal file
View File

@ -0,0 +1,99 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
#ifndef MDBX_64BIT_ATOMIC
#error "The MDBX_64BIT_ATOMIC must be defined before"
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#error "The MDBX_64BIT_CAS must be defined before"
#endif /* MDBX_64BIT_CAS */
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
#include <cstdatomic>
#define MDBX_HAVE_C11ATOMICS
#elif !defined(__cplusplus) && \
(__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
!defined(__STDC_NO_ATOMICS__) && \
(__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \
!(defined(__GNUC__) || defined(__clang__)))
#include <stdatomic.h>
#define MDBX_HAVE_C11ATOMICS
#elif defined(__GNUC__) || defined(__clang__)
#elif defined(_MSC_VER)
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
#pragma warning(disable : 4133) /* 'function': incompatible types - from \
'size_t' to 'LONGLONG' */
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \
'std::size_t', possible loss of data */
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \
'long', possible loss of data */
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
#elif defined(__APPLE__)
#include <libkern/OSAtomic.h>
#else
#error FIXME atomic-ops
#endif
/* Subset of memory-order semantics used by the mdbx atomic wrappers:
 * relaxed, or acquire-on-load / release-on-store. */
typedef enum mdbx_memory_order {
  mo_Relaxed,
  mo_AcquireRelease
  /* , mo_SequentialConsistency */
} mdbx_memory_order_t;
/* 32-bit atomic cell: plain volatile view, plus a C11 _Atomic view
 * when the toolchain provides usable C11 atomics. */
typedef union {
  volatile uint32_t weak;
#ifdef MDBX_HAVE_C11ATOMICS
  volatile _Atomic uint32_t c11a;
#endif /* MDBX_HAVE_C11ATOMICS */
} mdbx_atomic_uint32_t;
/* 64-bit atomic cell. When 64-bit C11 atomics are not fully usable the
 * value is also addressable as two 32-bit halves, laid out so that
 * `low`/`high` match the host byte order. */
typedef union {
  volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
  volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
  __anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    mdbx_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    mdbx_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
  };
#endif
} mdbx_atomic_uint64_t;
#ifdef MDBX_HAVE_C11ATOMICS
/* Crutches for C11 atomic compiler's bugs */
#if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127
#define MDBX_c11a_ro(type, ptr) (&(ptr)->weak)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->weak)
#elif defined(__clang__) && __clang__ < 8
#define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#else
#define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#endif /* Crutches for C11 atomic compiler's bugs */
#define mo_c11_store(fence) \
(((fence) == mo_Relaxed) ? memory_order_relaxed \
: ((fence) == mo_AcquireRelease) ? memory_order_release \
: memory_order_seq_cst)
#define mo_c11_load(fence) \
(((fence) == mo_Relaxed) ? memory_order_relaxed \
: ((fence) == mo_AcquireRelease) ? memory_order_acquire \
: memory_order_seq_cst)
#endif /* MDBX_HAVE_C11ATOMICS */
#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)

164
src/audit.c Normal file
View File

@ -0,0 +1,164 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Find the freshest tree descriptor for `dbi` along the chain of nested
 * transactions. Returns nullptr for a DBI known to be absent, or
 * `fallback` when no transaction in the chain has decisive state. */
__cold static tree_t *audit_db_dig(const MDBX_txn *txn, const size_t dbi,
                                   tree_t *fallback) {
  for (const MDBX_txn *scan = txn; scan; scan = scan->parent) {
    tASSERT(txn, txn->n_dbi == scan->n_dbi);
    const uint8_t state = dbi_state(scan, dbi);
    if ((state & DBI_LINDO) == 0)
      continue;
    const uint8_t masked = state & (DBI_VALID | DBI_STALE | DBI_OLDEN);
    if (masked == DBI_VALID || masked == DBI_OLDEN)
      return scan->dbs + dbi;
    if (masked == 0)
      return nullptr;
    /* stale entries defer the decision to the parent transaction */
    if (masked != (DBI_VALID | DBI_STALE) && masked != (DBI_OLDEN | DBI_STALE))
      tASSERT(txn, !!"unexpected dig->dbi_state[dbi]");
  }
  return fallback;
}
/* Total pages referenced by a tree, or 0 for an absent tree. */
static size_t audit_db_used(const tree_t *db) {
  if (!db)
    return 0;
  return (size_t)db->branch_pages + (size_t)db->leaf_pages +
         (size_t)db->large_pages;
}
/* Core audit: verifies that pending + GC-recorded + in-use pages exactly
 * cover all allocated pages of the environment. Caller must hold
 * env->dbi_lock (see audit_ex()). Returns MDBX_SUCCESS when the page
 * accounting balances, MDBX_PROBLEM otherwise, or a cursor/corruption
 * error code. */
__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored,
                                  bool dont_filter_gc) {
  const MDBX_env *const env = txn->env;
  /* pages owned by this write-txn but not yet recorded anywhere */
  size_t pending = 0;
  if ((txn->flags & MDBX_TXN_RDONLY) == 0)
    pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) +
              (MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored);

  cursor_couple_t cx;
  int rc = cursor_init(&cx.outer, txn, FREE_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* sum page counts over all GC records, optionally skipping the records
   * this transaction has already reclaimed */
  size_t gc = 0;
  MDBX_val key, data;
  rc = outer_first(&cx.outer, &key, &data);
  while (rc == MDBX_SUCCESS) {
    if (!dont_filter_gc) {
      if (unlikely(key.iov_len != sizeof(txnid_t))) {
        ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid GC-key size", (unsigned)key.iov_len);
        return MDBX_CORRUPTED;
      }
      txnid_t id = unaligned_peek_u64(4, key.iov_base);
      if (txn->tw.gc.reclaimed) {
        for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed); ++i)
          if (id == txn->tw.gc.reclaimed[i])
            goto skip;
      } else if (id <= txn->tw.gc.last_reclaimed)
        goto skip;
    }
    /* the first word of a GC record is the length of its page-number list */
    gc += *(pgno_t *)data.iov_base;
  skip:
    rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT);
  }
  tASSERT(txn, rc == MDBX_NOTFOUND);

  /* bitmap of opened DBIs already accounted via the MAIN_DBI scan below;
   * n_dbi is bounded, so the alloca() is small by construction */
  const size_t done_bitmap_size = (txn->n_dbi + CHAR_BIT - 1) / CHAR_BIT;
  uint8_t *const done_bitmap = alloca(done_bitmap_size);
  memset(done_bitmap, 0, done_bitmap_size);
  if (txn->parent) {
    tASSERT(txn, txn->n_dbi == txn->parent->n_dbi &&
                 txn->n_dbi == txn->env->txn->n_dbi);
#if MDBX_ENABLE_DBI_SPARSE
    tASSERT(txn, txn->dbi_sparse == txn->parent->dbi_sparse &&
                 txn->dbi_sparse == txn->env->txn->dbi_sparse);
#endif /* MDBX_ENABLE_DBI_SPARSE */
  }

  size_t used = NUM_METAS +
                audit_db_used(audit_db_dig(txn, FREE_DBI, nullptr)) +
                audit_db_used(audit_db_dig(txn, MAIN_DBI, nullptr));
  rc = cursor_init(&cx.outer, txn, MAIN_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* walk every leaf of MAIN_DBI and add the page usage of each sub-tree */
  rc = tree_search(&cx.outer, nullptr, Z_FIRST);
  while (rc == MDBX_SUCCESS) {
    page_t *mp = cx.outer.pg[cx.outer.top];
    for (size_t k = 0; k < page_numkeys(mp); k++) {
      node_t *node = page_node(mp, k);
      if (node_flags(node) != N_SUBDATA)
        continue;
      if (unlikely(node_ds(node) != sizeof(tree_t))) {
        ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid dupsort sub-tree node size", (unsigned)node_ds(node));
        return MDBX_CORRUPTED;
      }
      tree_t reside;
      const tree_t *db = memcpy(&reside, node_data(node), sizeof(reside));
      const MDBX_val name = {node_key(node), node_ks(node)};
      /* prefer the in-txn state of an opened DBI over the stored tree */
      for (size_t dbi = CORE_DBS; dbi < env->n_dbi; ++dbi) {
        if (dbi >= txn->n_dbi || !(env->dbs_flags[dbi] & DB_VALID))
          continue;
        if (env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name))
          continue;
        done_bitmap[dbi / CHAR_BIT] |= 1 << dbi % CHAR_BIT;
        db = audit_db_dig(txn, dbi, &reside);
        break;
      }
      used += audit_db_used(db);
    }
    rc = cursor_sibling_right(&cx.outer);
  }
  tASSERT(txn, rc == MDBX_NOTFOUND);

  /* account opened DBIs whose name was not met during the scan above */
  for (size_t dbi = CORE_DBS; dbi < txn->n_dbi; ++dbi) {
    if (done_bitmap[dbi / CHAR_BIT] & (1 << dbi % CHAR_BIT))
      continue;
    const tree_t *db = audit_db_dig(txn, dbi, nullptr);
    if (db)
      used += audit_db_used(db);
    else if (dbi_state(txn, dbi))
      WARNING("audit %s@%" PRIaTXN
              ": unable account dbi %zd / \"%*s\", state 0x%02x",
              txn->parent ? "nested-" : "", txn->txnid, dbi,
              (int)env->kvs[dbi].name.iov_len,
              (const char *)env->kvs[dbi].name.iov_base, dbi_state(txn, dbi));
  }

  if (pending + gc + used == txn->geo.first_unallocated)
    return MDBX_SUCCESS;

  if ((txn->flags & MDBX_TXN_RDONLY) == 0)
    ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + "
          "%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)",
          txn->txnid, pending, txn->tw.loose_count,
          MDBX_PNL_GETSIZE(txn->tw.relist),
          txn->tw.retired_pages ? MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0,
          retired_stored);
  ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu"
        "(gc) + %zu(count) = %zu(total) <> %zu"
        "(allocated)",
        txn->txnid, pending, gc, used, pending + gc + used,
        (size_t)txn->geo.first_unallocated);
  return MDBX_PROBLEM;
}
/* Public audit entry point: serializes with DBI-table changes via
 * env->dbi_lock, then delegates to audit_ex_locked(). */
__cold int audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) {
  MDBX_env *const env = txn->env;
  const int err = osal_fastmutex_acquire(&env->dbi_lock);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  const int rc = audit_ex_locked(txn, retired_stored, dont_filter_gc);
  ENSURE(txn->env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS);
  return rc;
}

View File

@ -1,13 +1,13 @@
N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE | MRESIZE |
--|---------|-----------|--------------|----------|-----------|------------|---------|----------|---------|
0 |0000 0001|ALLOC_RSRV |TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH | |
1 |0000 0002|ALLOC_UNIMP|TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF | |
2 |0000 0004|ALLOC_COLSC|TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| |
0 |0000 0001|ALLOC_RSRV |TXN_FINISHED | | |DBI_DIRTY |N_BIGDATA|P_BRANCH | |
1 |0000 0002|ALLOC_UNIMP|TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |N_SUBDATA|P_LEAF | |
2 |0000 0004|ALLOC_COLSC|TXN_DIRTY |DUPSORT | |DBI_FRESH |N_DUPDATA|P_LARGE | |
3 |0000 0008|ALLOC_SSCAN|TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | |
4 |0000 0010|ALLOC_FIFO |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | |
5 |0000 0020| |TXN_DRAINED_GC|INTEGERDUP|NODUPDATA | | |P_LEAF2 | |
5 |0000 0020| |TXN_DRAINED_GC|INTEGERDUP|NODUPDATA | | |P_DUPFIX | |
6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_OLDEN | |P_SUBP | |
7 |0000 0080| | | |ALLDUPS |DBI_LINDO | | | |
7 |0000 0080| | |DB_VALID |ALLDUPS |DBI_LINDO | | | |
8 |0000 0100| _MAY_MOVE | | | | | | | <= |
9 |0000 0200| _MAY_UNMAP| | | | | | | <= |
10|0000 0400| | | | | | | | |
@ -15,7 +15,7 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD
12|0000 1000| | | | | | | | |
13|0000 2000|VALIDATION | | | | | |P_SPILLED | |
14|0000 4000|NOSUBDIR | | | | | |P_LOOSE | |
15|0000 8000| | |DB_VALID | | | |P_FROZEN | |
15|0000 8000| | | | | | |P_FROZEN | |
16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE | | |
17|0002 0000|RDONLY |TXN_RDONLY | |APPEND | |APPEND | | <= |
18|0004 0000|NOMETASYNC |TXN_NOMETASYNC|CREATE |APPENDDUP | | | | |

2097
src/chk.c Normal file

File diff suppressed because it is too large Load Diff

353
src/cogs.c Normal file
View File

@ -0,0 +1,353 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/*------------------------------------------------------------------------------
* Pack/Unpack 16-bit values for Grow step & Shrink threshold */
/* Unpack (mantissa, exponent) to a page count: 32768 + (m+1) << (e+8).
 * Inverse of v2me(). */
MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t me2v(size_t m, size_t e) {
  assert(m < 2048 && e < 8);
  return (pgno_t)(32768 + ((m + 1) << (e + 8)));
}
/* Pack a page count into quantized form for the given exponent `e`:
 * rounds the mantissa up, then stores it in bits 1..11 and `e` in
 * bits 12..14, with marker bits 0 and 15 set. Inverse of me2v(). */
MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t v2me(size_t v, size_t e) {
  assert(v > (e ? me2v(2047, e - 1) : 32768));
  assert(v <= me2v(2047, e));
  size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8);
  m -= m > 0;
  assert(m < 2048 && e < 8);
  // f e d c b a 9 8 7 6 5 4 3 2 1 0
  // 1 e e e m m m m m m m m m m m 1
  const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1));
  assert(pv != 65535);
  return pv;
}
/* Convert 16-bit packed (exponential quantized) value to number of pages */
pgno_t pv2pages(uint16_t pv) {
  /* values without both marker bits (0 and 15) are stored verbatim */
  if ((pv & 0x8001) != 0x8001)
    return pv;
  if (pv == 65535)
    return 65536;
  // f e d c b a 9 8 7 6 5 4 3 2 1 0
  // 1 e e e m m m m m m m m m m m 1
  return me2v((pv >> 1) & 2047, (pv >> 12) & 7);
}
/* Convert number of pages to 16-bit packed (exponential quantized) value */
uint16_t pages2pv(size_t pages) {
  /* small values and even values below 65536 are representable verbatim */
  if (pages < 32769 || (pages < 65536 && (pages & 1) == 0))
    return (uint16_t)pages;
  /* pick the smallest exponent whose mantissa range covers `pages` */
  for (size_t e = 0; e < 7; ++e)
    if (pages <= me2v(2047, e))
      return v2me(pages, e);
  /* saturate: anything beyond the largest exact bucket becomes 65533 */
  return (pages < me2v(2046, 7)) ? v2me(pages, 7) : 65533;
}
__cold bool pv2pages_verify(void) {
bool ok = true, dump_translation = false;
for (size_t i = 0; i < 65536; ++i) {
size_t pages = pv2pages(i);
size_t x = pages2pv(pages);
size_t xp = pv2pages(x);
if (pages != xp) {
ERROR("%zu => %zu => %zu => %zu\n", i, pages, x, xp);
ok = false;
} else if (dump_translation && !(x == i || (x % 2 == 0 && x < 65536))) {
DEBUG("%zu => %zu => %zu => %zu\n", i, pages, x, xp);
}
}
return ok;
}
/*----------------------------------------------------------------------------*/
/* Round `bytes` up to the larger of the DB page size and the OS page size. */
MDBX_NOTHROW_PURE_FUNCTION size_t bytes_align2os_bytes(const MDBX_env *env,
                                                       size_t bytes) {
  const size_t granularity =
      (env->ps > globals.sys_pagesize) ? env->ps : globals.sys_pagesize;
  return ceil_powerof2(bytes, granularity);
}
/* Size in bytes of `pgno` DB pages, rounded up to the OS page size. */
MDBX_NOTHROW_PURE_FUNCTION size_t pgno_align2os_bytes(const MDBX_env *env,
                                                      size_t pgno) {
  return ceil_powerof2(pgno2bytes(env, pgno), globals.sys_pagesize);
}
/* Round a page number up so the mapped span is OS-page aligned. */
MDBX_NOTHROW_PURE_FUNCTION pgno_t pgno_align2os_pgno(const MDBX_env *env,
                                                     size_t pgno) {
  return bytes2pgno(env, pgno_align2os_bytes(env, pgno));
}
/*----------------------------------------------------------------------------*/
/* Compare two fixed-width integer keys (4 or 8 bytes) read with the given
 * expected alignment. Mismatched or unsupported sizes are logged and
 * compare as equal (returns 0). */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline int
cmp_int_inline(const size_t expected_alignment, const MDBX_val *a,
               const MDBX_val *b) {
  if (likely(a->iov_len == b->iov_len)) {
    /* branch ordering is tuned per word size: the 8-byte case is tried
     * first on 64-bit hosts and last on 32-bit hosts */
    if (sizeof(size_t) > 7 && likely(a->iov_len == 8))
      return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base),
                     unaligned_peek_u64(expected_alignment, b->iov_base));
    if (likely(a->iov_len == 4))
      return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base),
                     unaligned_peek_u32(expected_alignment, b->iov_base));
    if (sizeof(size_t) < 8 && likely(a->iov_len == 8))
      return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base),
                     unaligned_peek_u64(expected_alignment, b->iov_base));
  }
  ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP",
        a->iov_base, a->iov_len, b->iov_base, b->iov_len);
  return 0;
}
/* Compare two items pointing at unaligned unsigned int's. */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_unaligned(const MDBX_val *a,
                                                       const MDBX_val *b) {
  return cmp_int_inline(1, a, b);
}
#ifndef cmp_int_align2
/* Compare two items pointing at 2-byte aligned unsigned int's.
 * (an arch-specific macro of the same name may override this definition) */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align2(const MDBX_val *a,
                                                    const MDBX_val *b) {
  return cmp_int_inline(2, a, b);
}
#endif /* cmp_int_align2 */
#ifndef cmp_int_align4
/* Compare two items pointing at 4-byte aligned unsigned int's.
 * (an arch-specific macro of the same name may override this definition) */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align4(const MDBX_val *a,
                                                    const MDBX_val *b) {
  return cmp_int_inline(4, a, b);
}
#endif /* cmp_int_align4 */
/* Compare two items lexically */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lexical(const MDBX_val *a,
                                                 const MDBX_val *b) {
  const size_t la = a->iov_len, lb = b->iov_len;
  if (la == lb)
    return la ? memcmp(a->iov_base, b->iov_base, la) : 0;
  /* unequal lengths: the common prefix decides, then the shorter sorts first */
  const size_t shortest = (la < lb) ? la : lb;
  if (shortest) {
    const int cmp = memcmp(a->iov_base, b->iov_base, shortest);
    if (likely(cmp))
      return cmp;
  }
  return (la < lb) ? -1 : 1;
}
/* Gather the trailing 1..3 bytes into one comparable word; the index
 * expressions below select bytes (0,0,0), (0,1,1) or (0,1,2) so every
 * length reads only in-bounds bytes. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
tail3le(const uint8_t *p, size_t l) {
  STATIC_ASSERT(sizeof(unsigned) > 2);
  // 1: 0 0 0
  // 2: 0 1 1
  // 3: 0 1 2
  return p[0] | p[l >> 1] << 8 | p[l - 1] << 16;
}
/* Compare two items in reverse byte order */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_reverse(const MDBX_val *a,
                                                 const MDBX_val *b) {
  size_t left = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
  if (likely(left)) {
    /* walk from the tails toward the heads, a machine word at a time;
     * byte-swap on big-endian hosts so word compares match byte order */
    const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len);
    const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len);
    while (left >= sizeof(size_t)) {
      pa -= sizeof(size_t);
      pb -= sizeof(size_t);
      left -= sizeof(size_t);
      STATIC_ASSERT(sizeof(size_t) == 4 || sizeof(size_t) == 8);
      if (sizeof(size_t) == 4) {
        uint32_t xa = unaligned_peek_u32(1, pa);
        uint32_t xb = unaligned_peek_u32(1, pb);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
        xa = osal_bswap32(xa);
        xb = osal_bswap32(xb);
#endif /* __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ */
        if (xa != xb)
          return (xa < xb) ? -1 : 1;
      } else {
        uint64_t xa = unaligned_peek_u64(1, pa);
        uint64_t xb = unaligned_peek_u64(1, pb);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
        xa = osal_bswap64(xa);
        xb = osal_bswap64(xb);
#endif /* __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ */
        if (xa != xb)
          return (xa < xb) ? -1 : 1;
      }
    }
    /* on 64-bit hosts drain a remaining 4..7 byte chunk with a 32-bit step */
    if (sizeof(size_t) == 8 && left >= 4) {
      pa -= 4;
      pb -= 4;
      left -= 4;
      uint32_t xa = unaligned_peek_u32(1, pa);
      uint32_t xb = unaligned_peek_u32(1, pb);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
      xa = osal_bswap32(xa);
      xb = osal_bswap32(xb);
#endif /* __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ */
      if (xa != xb)
        return (xa < xb) ? -1 : 1;
    }
    /* the last 1..3 bytes are folded into a single comparable word */
    if (left) {
      unsigned xa = tail3le(pa - left, left);
      unsigned xb = tail3le(pb - left, left);
      if (xa != xb)
        return (xa < xb) ? -1 : 1;
    }
  }
  return CMP2INT(a->iov_len, b->iov_len);
}
/* Fast non-lexically comparator */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lenfast(const MDBX_val *a,
                                                 const MDBX_val *b) {
  /* order by length first; only equal-length items are compared bytewise */
  const int by_len = CMP2INT(a->iov_len, b->iov_len);
  if (likely(by_len) || a->iov_len == 0)
    return by_len;
  return memcmp(a->iov_base, b->iov_base, a->iov_len);
}
/* Bytewise-equality tail for eq_fast(): uses possibly-overlapping word
 * loads for short lengths, memcmp otherwise. */
MDBX_NOTHROW_PURE_FUNCTION __hot bool
eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l) {
  if (likely(l > 3)) {
    /* 4..8 bytes: two overlapping 32-bit loads cover the whole span */
    if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9))
      return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) |
              (unaligned_peek_u32(1, a + l - 4) -
               unaligned_peek_u32(1, b + l - 4))) == 0;
    /* 8..16 bytes: same trick with 64-bit loads */
    if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17))
      return ((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) |
              (unaligned_peek_u64(1, a + l - 8) -
               unaligned_peek_u64(1, b + l - 8))) == 0;
    return memcmp(a, b, l) == 0;
  }
  if (likely(l))
    return tail3le(a, l) == tail3le(b, l);
  return true;
}
/* Degenerate comparator for equality probing: 0 when equal, else "greater". */
int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b) {
  if (eq_fast(a, b))
    return 0;
  return 1;
}
/* Degenerate comparator for equality probing: 0 when equal, else "less". */
int cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b) {
  if (eq_fast(a, b))
    return 0;
  return -1;
}
/*----------------------------------------------------------------------------*/
/* Maintain env->mlocked_pgno and the shared lock/unlock event counters
 * after a successful mlock/munlock of DB pages up to
 * `new_aligned_mlocked_pgno`. `lock_not_release` selects the direction
 * (true = pages were locked, boundary may only grow). */
__cold void update_mlcnt(const MDBX_env *env,
                         const pgno_t new_aligned_mlocked_pgno,
                         const bool lock_not_release) {
  for (;;) {
    const pgno_t mlock_pgno_before =
        atomic_load32(&env->mlocked_pgno, mo_AcquireRelease);
    eASSERT(env,
            pgno_align2os_pgno(env, mlock_pgno_before) == mlock_pgno_before);
    eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) ==
                     new_aligned_mlocked_pgno);
    /* nothing to do if the boundary already moved past the new value */
    if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno)
                         : (mlock_pgno_before <= new_aligned_mlocked_pgno))
      break;
    if (likely(atomic_cas32(&((MDBX_env *)env)->mlocked_pgno, mlock_pgno_before,
                            new_aligned_mlocked_pgno)))
      /* boundary updated: adjust the shared process counters
       * (mlcnt[0] counts lock events, mlcnt[1] unlock events) */
      for (;;) {
        mdbx_atomic_uint32_t *const mlcnt = env->lck->mlcnt;
        const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed);
        const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed);
        /* a 0 -> N transition means this process started mlock-ing */
        if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) {
          eASSERT(env, lock_not_release);
          if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1)))
            continue;
        }
        /* an N -> 0 transition means this process released everything */
        if (new_aligned_mlocked_pgno == 0 &&
            (snap_locked - snap_unlocked) > 0) {
          eASSERT(env, !lock_not_release);
          if (unlikely(
                  !atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1)))
            continue;
        }
        NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u",
               lock_not_release ? "lock" : "unlock",
               lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno,
               lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_before,
               snap_locked - snap_unlocked,
               atomic_load32(mlcnt + 0, mo_Relaxed) -
                   atomic_load32(mlcnt + 1, mo_Relaxed));
        return;
      }
  }
}
/* Unlock (munlock/VirtualUnlock) the mapped region from `aligned_pgno`
 * up to `end_bytes`, then shrink the mlocked-boundary bookkeeping.
 * Failures are logged but not propagated (best-effort). */
__cold void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno,
                          const size_t end_bytes) {
  if (atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) > aligned_pgno) {
    int err = MDBX_ENOSYS;
    const size_t munlock_begin = pgno2bytes(env, aligned_pgno);
    const size_t munlock_size = end_bytes - munlock_begin;
    eASSERT(env, end_bytes % globals.sys_pagesize == 0 &&
                 munlock_begin % globals.sys_pagesize == 0 &&
                 munlock_size % globals.sys_pagesize == 0);
#if defined(_WIN32) || defined(_WIN64)
    err =
        VirtualUnlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size)
            ? MDBX_SUCCESS
            : (int)GetLastError();
    /* unlocking a region that wasn't locked is not an error here */
    if (err == ERROR_NOT_LOCKED)
      err = MDBX_SUCCESS;
#elif defined(_POSIX_MEMLOCK_RANGE)
    err = munlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size)
              ? errno
              : MDBX_SUCCESS;
#endif
    if (likely(err == MDBX_SUCCESS))
      update_mlcnt(env, aligned_pgno, false);
    else {
#if defined(_WIN32) || defined(_WIN64)
      WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size,
              err);
#else
      WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err);
#endif
    }
  }
}
/* Release the memory-lock on the whole currently-mapped DB region. */
__cold void munlock_all(const MDBX_env *env) {
  munlock_after(env, 0, bytes_align2os_bytes(env, env->dxb_mmap.current));
}
/*----------------------------------------------------------------------------*/
uint32_t combine_durability_flags(const uint32_t a, const uint32_t b) {
uint32_t r = a | b;
/* avoid false MDBX_UTTERLY_NOSYNC */
if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
!F_ISSET(b, MDBX_UTTERLY_NOSYNC))
r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC;
/* convert DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */
if ((r & (MDBX_WRITEMAP | DEPRECATED_MAPASYNC)) ==
(MDBX_WRITEMAP | DEPRECATED_MAPASYNC) &&
!F_ISSET(r, MDBX_UTTERLY_NOSYNC))
r = (r - DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC;
/* force MDBX_NOMETASYNC if NOSYNC enabled */
if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC))
r |= MDBX_NOMETASYNC;
assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) &&
!F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
!F_ISSET(b, MDBX_UTTERLY_NOSYNC)));
return r;
}

558
src/cogs.h Normal file
View File

@ -0,0 +1,558 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL pgno_t pv2pages(uint16_t pv);
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint16_t pages2pv(size_t pages);
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool pv2pages_verify(void);
/*------------------------------------------------------------------------------
* Nodes, Keys & Values length limitation factors:
*
* BRANCH_NODE_MAX
* Branch-page must contain at least two nodes, within each a key and a child
* page number. But page can't be split if it contains less that 4 keys,
* i.e. a page should not overflow before adding the fourth key. Therefore,
* at least 3 branch-node should fit in the single branch-page. Further, the
* first node of a branch-page doesn't contain a key, i.e. the first node
* is always require space just for itself. Thus:
* PAGESPACE = pagesize - page_hdr_len;
* BRANCH_NODE_MAX = even_floor(
* (PAGESPACE - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t));
* KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len;
*
* LEAF_NODE_MAX
* Leaf-node must fit into single leaf-page, where a value could be placed on
* a large/overflow page. However, may require to insert a nearly page-sized
* node between two large nodes are already fill-up a page. In this case the
* page must be split to two if some pair of nodes fits on one page, or
* otherwise the page should be split to the THREE with a single node
* per each of ones. Such 1-into-3 page splitting is costly and complex since
* requires TWO insertion into the parent page, that could lead to split it
* and so on up to the root. Therefore double-splitting is avoided here and
* the maximum node size is half of a leaf page space:
* LEAF_NODE_MAX = even_floor(PAGESPACE / 2 - sizeof(indx_t));
* DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - NODESIZE - KEYLEN_MAX;
*
* - SubDatabase-node must fit into one leaf-page:
* SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(tree_t);
*
* - Dupsort values itself are a keys in a dupsort-subdb and couldn't be longer
* than the KEYLEN_MAX. But dupsort node must not great than LEAF_NODE_MAX,
* since dupsort value couldn't be placed on a large/overflow page:
* DUPSORT_DATALEN_MAX = min(KEYLEN_MAX,
* max(DATALEN_NO_OVERFLOW, sizeof(tree_t));
*/
#define PAGESPACE(pagesize) ((pagesize) - PAGEHDRSZ)
#define BRANCH_NODE_MAX(pagesize) \
(EVEN_FLOOR((PAGESPACE(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) - \
sizeof(indx_t)))
#define LEAF_NODE_MAX(pagesize) \
(EVEN_FLOOR(PAGESPACE(pagesize) / 2) - sizeof(indx_t))
#define MAX_GC1OVPAGE(pagesize) (PAGESPACE(pagesize) / sizeof(pgno_t) - 1)
/* Maximal key length for a table with the given flags at the given page
 * size: fixed 8 bytes for INTEGERKEY; otherwise bounded by the branch
 * node limit, further tightened for dupsort tables whose leaf nodes must
 * also carry a tree_t. */
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
keysize_max(size_t pagesize, MDBX_db_flags_t flags) {
  assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE &&
         is_powerof2(pagesize));
  STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE >= 8);
  if (flags & MDBX_INTEGERKEY)
    return 8 /* sizeof(uint64_t) */;
  const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE;
  STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE -
                    /* sizeof(uint64) as a key */ 8 >
                sizeof(tree_t));
  if (flags &
      (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
    const intptr_t max_dupsort_leaf_key =
        LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(tree_t);
    return (max_branch_key < max_dupsort_leaf_key) ? max_branch_key
                                                   : max_dupsort_leaf_key;
  }
  return max_branch_key;
}
/* Same limit as keysize_max(), but computed from the environment's
 * precomputed node-size limits; cross-checked against keysize_max()
 * by the trailing eASSERT. */
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
env_keysize_max(const MDBX_env *env, MDBX_db_flags_t flags) {
  size_t size_max;
  if (flags & MDBX_INTEGERKEY)
    size_max = 8 /* sizeof(uint64_t) */;
  else {
    const intptr_t max_branch_key = env->branch_nodemax - NODESIZE;
    STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE -
                      /* sizeof(uint64) as a key */ 8 >
                  sizeof(tree_t));
    if (flags &
        (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
      const intptr_t max_dupsort_leaf_key =
          env->leaf_nodemax - NODESIZE - sizeof(tree_t);
      size_max = (max_branch_key < max_dupsort_leaf_key) ? max_branch_key
                                                         : max_dupsort_leaf_key;
    } else
      size_max = max_branch_key;
  }
  eASSERT(env, size_max == keysize_max(env->ps, flags));
  return size_max;
}
/* Minimal key length: INTEGERKEY keys are at least a 32-bit integer,
 * any other table accepts empty keys. */
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
keysize_min(MDBX_db_flags_t flags) {
  if (flags & MDBX_INTEGERKEY)
    return 4 /* sizeof(uint32_t) */;
  return 0;
}
/* Minimal value length, driven by the table's dupsort mode. */
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
valsize_min(MDBX_db_flags_t flags) {
  if (flags & MDBX_INTEGERDUP)
    return 4 /* sizeof(uint32_t) */;
  return (flags & MDBX_DUPFIXED) ? sizeof(indx_t) : 0;
}
/* Maximal value length for a table with the given flags at the given
 * page size. Dupsort values behave like keys; otherwise the limit is
 * capped by a hard byte ceiling, the overflow-page budget, and half of
 * the maximal map size. */
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
valsize_max(size_t pagesize, MDBX_db_flags_t flags) {
  assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE &&
         is_powerof2(pagesize));

  if (flags & MDBX_INTEGERDUP)
    return 8 /* sizeof(uint64_t) */;

  if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
    return keysize_max(pagesize, 0);

  const unsigned page_ln2 = log2n_powerof2(pagesize);
  const size_t hard = 0x7FF00000ul;
  const size_t hard_pages = hard >> page_ln2;
  STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO);
  const size_t pages_limit = PAGELIST_LIMIT / 4;
  const size_t limit =
      (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2);
  return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
}
/* Returns the maximal data length for the given table flags using the
 * environment's pre-computed page-size logarithm; must agree with the
 * pagesize-based valsize_max(), which the eASSERT below verifies. */
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
env_valsize_max(const MDBX_env *env, MDBX_db_flags_t flags) {
  size_t size_max;
  if (flags & MDBX_INTEGERDUP)
    size_max = 8 /* sizeof(uint64_t) */;
  else if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
    /* dupsort values are keys of the nested tree */
    size_max = env_keysize_max(env, 0);
  else {
    /* hard cap just under 2 GiB */
    const size_t hard = 0x7FF00000ul;
    const size_t hard_pages = hard >> env->ps2ln;
    STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO);
    const size_t pages_limit = PAGELIST_LIMIT / 4;
    const size_t limit =
        (hard_pages < pages_limit) ? hard : (pages_limit << env->ps2ln);
    size_max = (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
  }
  /* cross-check against the pure pagesize-based computation */
  eASSERT(env, size_max == valsize_max(env->ps, flags));
  return size_max;
}
/*----------------------------------------------------------------------------*/
/* Bytes needed (including the indx_t slot) to store the key/data pair
 * as a node on a leaf page. If the pair does not fit into a leaf node,
 * only the key plus a pgno reference is accounted, since the data goes
 * to a large/overflow page. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) {
  size_t node_bytes = node_size(key, data);
  if (node_bytes > env->leaf_nodemax)
    /* put on large/overflow page */
    node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t);

  return node_bytes + sizeof(indx_t);
}
/* Bytes needed (including the indx_t slot) to store the key as a node
 * on a branch page. An oversized branch key cannot be spilled to an
 * overflow page, so such a case panics. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
branch_size(const MDBX_env *env, const MDBX_val *key) {
  /* Size of a node in a branch page with a given key.
   * This is just the node header plus the key, there is no data. */
  size_t node_bytes = node_size(key, nullptr);
  if (unlikely(node_bytes > env->branch_nodemax)) {
    /* put on large/overflow page, not implemented */
    mdbx_panic("node_size(key) %zu > %u branch_nodemax", node_bytes,
               env->branch_nodemax);
    /* unreachable after the panic; kept for form */
    node_bytes = node_size(key, nullptr) + sizeof(pgno_t);
  }

  return node_bytes + sizeof(indx_t);
}
/* Maps table (db) flags to the flags of the nested tree that stores
 * dupsort values: MDBX_INTEGERDUP becomes MDBX_INTEGERKEY,
 * MDBX_REVERSEDUP becomes MDBX_REVERSEKEY, MDBX_DUPFIXED is kept as-is.
 * The mapping is done by bit-shifts validated at compile time. */
MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t
flags_db2sub(uint16_t db_flags) {
  uint16_t sub_flags = db_flags & MDBX_DUPFIXED;

  /* MDBX_INTEGERDUP => MDBX_INTEGERKEY */
#define SHIFT_INTEGERDUP_TO_INTEGERKEY 2
  STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) ==
                MDBX_INTEGERKEY);
  sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY;

  /* MDBX_REVERSEDUP => MDBX_REVERSEKEY */
#define SHIFT_REVERSEDUP_TO_REVERSEKEY 5
  STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) ==
                MDBX_REVERSEKEY);
  sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY;

  return sub_flags;
}
/* Validates a sub-database flags combination: only the listed
 * dupsort-related sets are acceptable, and MDBX_REVERSEKEY together
 * with MDBX_INTEGERKEY is rejected as contradictory. */
static inline bool check_sdb_flags(unsigned flags) {
  switch (flags & ~(MDBX_REVERSEKEY | MDBX_INTEGERKEY)) {
  case MDBX_DB_DEFAULTS:
  case MDBX_DUPSORT:
  case MDBX_DUPSORT | MDBX_REVERSEDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
    return (flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) !=
           (MDBX_REVERSEKEY | MDBX_INTEGERKEY);
  default:
    NOTICE("invalid db-flags 0x%x", flags);
    return false;
  }
}
/*----------------------------------------------------------------------------*/
/* Converts a page number into a byte offset within the datafile. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t pgno2bytes(const MDBX_env *env,
                                                           size_t pgno) {
  eASSERT(env, (1u << env->ps2ln) == env->ps);
  const size_t offset = (size_t)pgno << env->ps2ln;
  return offset;
}
/* Returns the in-mmap address of the page with the given number. */
MDBX_NOTHROW_PURE_FUNCTION static inline page_t *pgno2page(const MDBX_env *env,
                                                           size_t pgno) {
  const size_t offset = pgno2bytes(env, pgno);
  return ptr_disp(env->dxb_mmap.base, offset);
}
/* Converts a byte offset within the datafile into a page number. */
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t bytes2pgno(const MDBX_env *env,
                                                           size_t bytes) {
  eASSERT(env, (env->ps >> env->ps2ln) == 1);
  const size_t pages = bytes >> env->ps2ln;
  return (pgno_t)pages;
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t
bytes_align2os_bytes(const MDBX_env *env, size_t bytes);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t
pgno_align2os_bytes(const MDBX_env *env, size_t pgno);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL pgno_t
pgno_align2os_pgno(const MDBX_env *env, size_t pgno);
/* Number of pages required to hold a large/overflow item of the given
 * payload size, accounting for the header of the first page. */
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
largechunk_npages(const MDBX_env *env, size_t bytes) {
  const size_t with_header = PAGEHDRSZ - 1 + bytes;
  return bytes2pgno(env, with_header) + 1;
}
MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val get_key(const node_t *node) {
MDBX_val key;
key.iov_len = node_ks(node);
key.iov_base = node_key(node);
return key;
}
/* Same as get_key(), but tolerates a nullptr destination. */
static inline void get_key_optional(const node_t *node,
                                    MDBX_val *keyptr /* __may_null */) {
  if (keyptr != nullptr)
    *keyptr = get_key(node);
}
/* Address of the payload area just past the page header. */
MDBX_NOTHROW_PURE_FUNCTION static inline void *page_data(const page_t *mp) {
  return ptr_disp(mp, PAGEHDRSZ);
}
/* Recovers the page_t from a pointer to its payload (the inverse of
 * page_data(), via container_of over the `entries` member). */
MDBX_NOTHROW_PURE_FUNCTION static inline const page_t *
data_page(const void *data) {
  return container_of(data, page_t, entries);
}
/* A meta-page stores its meta_t immediately after the page header. */
MDBX_NOTHROW_PURE_FUNCTION static inline meta_t *page_meta(page_t *mp) {
  void *const payload = page_data(mp);
  return (meta_t *)payload;
}
/* Number of entries on the page: `lower` grows by one two-byte indx_t
 * slot per entry, so halving it yields the entry count. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) {
  return mp->lower / 2;
}
/* Bytes of free space remaining between the entry index (growing up
 * from `lower`) and the node data (growing down from `upper`). */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) {
  return mp->upper - mp->lower;
}
/* Usable payload bytes of an empty page: page size minus the header. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
page_space(const MDBX_env *env) {
  STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
  return env->ps - PAGEHDRSZ;
}
/* Bytes of payload already consumed on the page. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_used(const MDBX_env *env,
                                                          const page_t *mp) {
  const size_t total = page_space(env);
  const size_t avail = page_room(mp);
  return total - avail;
}
/* The percentage of space used in the page, in a percents. */
/* Returns the value multiplied by 10 (i.e. in tenths of a percent),
 * rounded to the nearest by adding half the divisor. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline unsigned
page_fill_percentum_x10(const MDBX_env *env, const page_t *mp) {
  const size_t space = page_space(env);
  return (unsigned)((page_used(env, mp) * 1000 + space / 2) / space);
}
/* Returns the i-th node of a leaf or branch page; node offsets stored
 * in `entries` are relative to the end of the page header and must be
 * even (nodes are 2-byte aligned). */
MDBX_NOTHROW_PURE_FUNCTION static inline node_t *page_node(const page_t *mp,
                                                           size_t i) {
  assert(page_type_compat(mp) == P_LEAF || page_type(mp) == P_BRANCH);
  assert(page_numkeys(mp) > i);
  assert(mp->entries[i] % 2 == 0);
  return ptr_disp(mp, mp->entries[i] + PAGEHDRSZ);
}
/* Address of the i-th fixed-size key on a P_DUPFIX page; such pages
 * hold no nodes, just densely packed keys of dupfix_ksize bytes each.
 * The `keysize` argument is used only for the debug assertion. */
MDBX_NOTHROW_PURE_FUNCTION static inline void *
page_dupfix_ptr(const page_t *mp, size_t i, size_t keysize) {
  assert(page_type_compat(mp) == (P_LEAF | P_DUPFIX) && i == (indx_t)i &&
         mp->dupfix_ksize == keysize);
  (void)keysize;
  return ptr_disp(mp, PAGEHDRSZ + mp->dupfix_ksize * (indx_t)i);
}
/* Fetches the i-th fixed-size key of a P_DUPFIX page as an MDBX_val. */
MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val
page_dupfix_key(const page_t *mp, size_t i, size_t keysize) {
  const MDBX_val r = {.iov_base = page_dupfix_ptr(mp, i, keysize),
                      .iov_len = mp->dupfix_ksize};
  return r;
}
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b);
#if MDBX_UNALIGNED_OK < 2 || \
(MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG))
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
/* Compare two items pointing at 2-byte aligned unsigned int's. */
cmp_int_align2(const MDBX_val *a, const MDBX_val *b);
#else
#define cmp_int_align2 cmp_int_unaligned
#endif /* !MDBX_UNALIGNED_OK || debug */
#if MDBX_UNALIGNED_OK < 4 || \
(MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG))
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
/* Compare two items pointing at 4-byte aligned unsigned int's. */
cmp_int_align4(const MDBX_val *a, const MDBX_val *b);
#else
#define cmp_int_align4 cmp_int_unaligned
#endif /* !MDBX_UNALIGNED_OK || debug */
/* Compare two items lexically */
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lexical(const MDBX_val *a,
const MDBX_val *b);
/* Compare two items in reverse byte order */
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_reverse(const MDBX_val *a,
const MDBX_val *b);
/* Fast non-lexically comparator */
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lenfast(const MDBX_val *a,
const MDBX_val *b);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL bool
eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l);
/* Fast equality check: compares lengths first (annotated `unlikely`
 * since most compared items are expected to differ), then the bytes
 * via eq_fast_slowpath(). */
MDBX_NOTHROW_PURE_FUNCTION static inline bool eq_fast(const MDBX_val *a,
                                                      const MDBX_val *b) {
  return unlikely(a->iov_len == b->iov_len) &&
         eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len);
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b);
/* Selects the builtin key comparator matching the table flags. */
static inline MDBX_cmp_func *builtin_keycmp(MDBX_db_flags_t flags) {
  if (flags & MDBX_REVERSEKEY)
    return cmp_reverse;
  if (flags & MDBX_INTEGERKEY)
    return cmp_int_align2;
  return cmp_lexical;
}
/* Selects the builtin data/value comparator matching the table flags;
 * non-dupsort tables compare by length first (cmp_lenfast). */
static inline MDBX_cmp_func *builtin_datacmp(MDBX_db_flags_t flags) {
  if (!(flags & MDBX_DUPSORT))
    return cmp_lenfast;
  if (flags & MDBX_INTEGERDUP)
    return cmp_int_unaligned;
  return (flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical;
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL uint32_t combine_durability_flags(const uint32_t a,
const uint32_t b);
/* Returns the in-env placeholder used instead of a shared lck-region
 * when the environment runs without one; the address is rounded up to
 * the cacheline size. */
MDBX_CONST_FUNCTION static inline lck_t *lckless_stub(const MDBX_env *env) {
  uintptr_t stub = (uintptr_t)&env->lckless_placeholder;
  /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */
  stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1);
  return (lck_t *)stub;
}
#if !(defined(_WIN32) || defined(_WIN64))
/* Maps the various "not supported / would block" errno values to
 * MDBX_RESULT_TRUE so callers can treat an unsupported syscall as a
 * soft (ignorable) condition; any other error is returned unchanged.
 * Only some of the ENO* names exist on a given platform, hence the
 * per-macro guards. */
MDBX_MAYBE_UNUSED static inline int ignore_enosys(int err) {
#ifdef ENOSYS
  if (err == ENOSYS)
    return MDBX_RESULT_TRUE;
#endif /* ENOSYS */
#ifdef ENOIMPL
  if (err == ENOIMPL)
    return MDBX_RESULT_TRUE;
#endif /* ENOIMPL */
#ifdef ENOTSUP
  if (err == ENOTSUP)
    return MDBX_RESULT_TRUE;
#endif /* ENOTSUP */
#ifdef ENOSUPP
  if (err == ENOSUPP)
    return MDBX_RESULT_TRUE;
#endif /* ENOSUPP */
#ifdef EOPNOTSUPP
  if (err == EOPNOTSUPP)
    return MDBX_RESULT_TRUE;
#endif /* EOPNOTSUPP */
  if (err == EAGAIN)
    return MDBX_RESULT_TRUE;
  return err;
}
#endif /* defined(_WIN32) || defined(_WIN64) */
/* Validates an MDBX_env handle: non-null, live signature, not in a
 * fatal-error state. When `wanna_active` is set, additionally requires
 * the environment to be opened (ENV_ACTIVE) and, if MDBX_ENV_CHECKPID
 * is enabled, to belong to the current process (guards against use
 * after fork). Returns MDBX_SUCCESS or an error code. */
static inline int check_env(const MDBX_env *env, const bool wanna_active) {
  if (unlikely(!env))
    return MDBX_EINVAL;

  if (unlikely(env->signature.weak != env_signature))
    return MDBX_EBADSIGN;

  if (unlikely(env->flags & ENV_FATAL_ERROR))
    return MDBX_PANIC;

  if (wanna_active) {
#if MDBX_ENV_CHECKPID
    if (unlikely(env->pid != osal_getpid()) && env->pid) {
      /* e.g. child process after fork: poison the handle */
      ((MDBX_env *)env)->flags |= ENV_FATAL_ERROR;
      return MDBX_PANIC;
    }
#endif /* MDBX_ENV_CHECKPID */
    if (unlikely((env->flags & ENV_ACTIVE) == 0))
      return MDBX_EPERM;
    eASSERT(env, env->dxb_mmap.base != nullptr);
  }

  return MDBX_SUCCESS;
}
/* Validates an MDBX_txn handle: non-null, live signature, and none of
 * the caller-forbidden `bad_bits` set. With MDBX_TXN_CHECKOWNER the
 * calling thread must own the transaction unless it is finished or runs
 * in MDBX_NOSTICKYTHREADS mode. Returns MDBX_SUCCESS or an error. */
static inline int check_txn(const MDBX_txn *txn, int bad_bits) {
  if (unlikely(!txn))
    return MDBX_EINVAL;

  if (unlikely(txn->signature != txn_signature))
    return MDBX_EBADSIGN;

  if (unlikely(txn->flags & bad_bits))
    return MDBX_BAD_TXN;

  tASSERT(txn, (txn->flags & MDBX_TXN_FINISHED) ||
                   (txn->flags & MDBX_NOSTICKYTHREADS) ==
                       (txn->env->flags & MDBX_NOSTICKYTHREADS));
#if MDBX_TXN_CHECKOWNER
  /* the ownership check is skipped when either NOSTICKYTHREADS or
   * FINISHED is set; the < comparison relies on the bit ordering
   * asserted below */
  STATIC_ASSERT((long)MDBX_NOSTICKYTHREADS > (long)MDBX_TXN_FINISHED);
  if ((txn->flags & (MDBX_NOSTICKYTHREADS | MDBX_TXN_FINISHED)) <
          MDBX_TXN_FINISHED &&
      unlikely(txn->owner != osal_thread_self()))
    return txn->owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN;
#endif /* MDBX_TXN_CHECKOWNER */

  if (bad_bits && unlikely(!txn->env->dxb_mmap.base))
    return MDBX_EPERM;

  return MDBX_SUCCESS;
}
/* Same as check_txn(), but additionally rejects read-only
 * transactions with MDBX_EACCESS. */
static inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) {
  const int err = check_txn(txn, bad_bits);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  return unlikely(txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_SUCCESS;
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL void mincore_clean_cache(const MDBX_env *const env);
MDBX_INTERNAL void update_mlcnt(const MDBX_env *env,
const pgno_t new_aligned_mlocked_pgno,
const bool lock_not_release);
MDBX_INTERNAL void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno,
const size_t end_bytes);
MDBX_INTERNAL void munlock_all(const MDBX_env *env);
/*----------------------------------------------------------------------------*/
/* Cache coherence and mmap invalidation */
#ifndef MDBX_CPU_WRITEBACK_INCOHERENT
#error "The MDBX_CPU_WRITEBACK_INCOHERENT must be defined before"
#elif MDBX_CPU_WRITEBACK_INCOHERENT
#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier()
#else
#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier()
#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
/* Flushes/invalidates caches for an mmap'ed range on platforms where
 * file writes and/or CPU caches are not coherent with the mapping.
 * The range is expanded to whole pages of the given `pagesize`. */
MDBX_MAYBE_UNUSED static inline void
osal_flush_incoherent_mmap(const void *addr, size_t nbytes,
                           const intptr_t pagesize) {
#ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE
#error "The MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined before"
#elif MDBX_MMAP_INCOHERENT_FILE_WRITE
  /* round the range outward to page boundaries and msync() it */
  char *const begin = (char *)(-pagesize & (intptr_t)addr);
  char *const end =
      (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1));
  int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0;
  eASSERT(nullptr, err == 0);
  (void)err;
#else
  (void)pagesize;
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */

#ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE
#error "The MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined before"
#elif MDBX_MMAP_INCOHERENT_CPU_CACHE
#ifdef DCACHE
  /* MIPS has cache coherency issues.
   * Note: for any nbytes >= on-chip cache size, entire is flushed. */
  cacheflush((void *)addr, nbytes, DCACHE);
#else
#error "Oops, cacheflush() not available"
#endif /* DCACHE */
#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */

#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE
  (void)addr;
  (void)nbytes;
#endif
}

198
src/coherency.c Normal file
View File

@ -0,0 +1,198 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */
/* Checks a meta-page snapshot for coherency against the GC (free) and
 * MAIN tree headers it refers to. Used as a workaround for the
 * incoherent unified page/buffer cache flaw on some OSes
 * (https://libmdbx.dqdkfa.ru/dead-github/issues/269).
 * When `report` is set, each inconsistency is logged and the lck's
 * incoherence counter is saturatingly incremented.
 * Returns true when everything looks consistent. */
static bool coherency_check(const MDBX_env *env, const txnid_t txnid,
                            const volatile tree_t *trees,
                            const volatile meta_t *meta, bool report) {
  const txnid_t freedb_mod_txnid = trees[FREE_DBI].mod_txnid;
  const txnid_t maindb_mod_txnid = trees[MAIN_DBI].mod_txnid;
  const pgno_t last_pgno = meta->geometry.now;

  /* root pages are dereferenced only when mapped and within the db */
  const pgno_t freedb_root_pgno = trees[FREE_DBI].root;
  const page_t *freedb_root =
      (env->dxb_mmap.base && freedb_root_pgno < last_pgno)
          ? pgno2page(env, freedb_root_pgno)
          : nullptr;

  const pgno_t maindb_root_pgno = trees[MAIN_DBI].root;
  const page_t *maindb_root =
      (env->dxb_mmap.base && maindb_root_pgno < last_pgno)
          ? pgno2page(env, maindb_root_pgno)
          : nullptr;
  const uint64_t magic_and_version =
      unaligned_peek_u64_volatile(4, &meta->magic_and_version);

  bool ok = true;
  /* 1) a non-P_INVALID root pgno must lie inside the database */
  if (freedb_root_pgno != P_INVALID &&
      unlikely(freedb_root_pgno >= last_pgno)) {
    if (report)
      WARNING(
          "catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN
          " %s",
          "free", freedb_root_pgno, txnid,
          (env->stuck_meta < 0)
              ? "(workaround for incoherent flaw of unified page/buffer cache)"
              : "(wagering meta)");
    ok = false;
  }

  if (maindb_root_pgno != P_INVALID &&
      unlikely(maindb_root_pgno >= last_pgno)) {
    if (report)
      WARNING(
          "catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN
          " %s",
          "main", maindb_root_pgno, txnid,
          (env->stuck_meta < 0)
              ? "(workaround for incoherent flaw of unified page/buffer cache)"
              : "(wagering meta)");
    ok = false;
  }

  /* 2) a tree's mod_txnid must not be newer than the meta's txnid, and
   * a present root implies a non-zero mod_txnid (for valid datafiles) */
  if (unlikely(txnid < freedb_mod_txnid ||
               (!freedb_mod_txnid && freedb_root &&
                likely(magic_and_version == MDBX_DATA_MAGIC)))) {
    if (report)
      WARNING(
          "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN
          " %s",
          "free", freedb_mod_txnid, txnid,
          (env->stuck_meta < 0)
              ? "(workaround for incoherent flaw of unified page/buffer cache)"
              : "(wagering meta)");
    ok = false;
  }
  if (unlikely(txnid < maindb_mod_txnid ||
               (!maindb_mod_txnid && maindb_root &&
                likely(magic_and_version == MDBX_DATA_MAGIC)))) {
    if (report)
      WARNING(
          "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN
          " %s",
          "main", maindb_mod_txnid, txnid,
          (env->stuck_meta < 0)
              ? "(workaround for incoherent flaw of unified page/buffer cache)"
              : "(wagering meta)");
    ok = false;
  }

  /* 3) the txnid stamped on a root page must match the tree's
   * mod_txnid (root pages touched via Valgrind/ASAN-safe accessors) */
  if (likely(freedb_root && freedb_mod_txnid)) {
    VALGRIND_MAKE_MEM_DEFINED(freedb_root, sizeof(freedb_root->txnid));
    MDBX_ASAN_UNPOISON_MEMORY_REGION(freedb_root, sizeof(freedb_root->txnid));
    const txnid_t root_txnid = freedb_root->txnid;
    if (unlikely(root_txnid != freedb_mod_txnid)) {
      if (report)
        WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN
                " for %sdb.mod_txnid %" PRIaTXN " %s",
                freedb_root_pgno, root_txnid, "free", freedb_mod_txnid,
                (env->stuck_meta < 0) ? "(workaround for incoherent flaw of "
                                        "unified page/buffer cache)"
                                      : "(wagering meta)");
      ok = false;
    }
  }
  if (likely(maindb_root && maindb_mod_txnid)) {
    VALGRIND_MAKE_MEM_DEFINED(maindb_root, sizeof(maindb_root->txnid));
    MDBX_ASAN_UNPOISON_MEMORY_REGION(maindb_root, sizeof(maindb_root->txnid));
    const txnid_t root_txnid = maindb_root->txnid;
    if (unlikely(root_txnid != maindb_mod_txnid)) {
      if (report)
        WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN
                " for %sdb.mod_txnid %" PRIaTXN " %s",
                maindb_root_pgno, root_txnid, "main", maindb_mod_txnid,
                (env->stuck_meta < 0) ? "(workaround for incoherent flaw of "
                                        "unified page/buffer cache)"
                                      : "(wagering meta)");
      ok = false;
    }
  }

  /* saturating increment of the incoherence statistics counter */
  if (unlikely(!ok) && report)
    env->lck->pgops.incoherence.weak =
        (env->lck->pgops.incoherence.weak >= INT32_MAX)
            ? INT32_MAX
            : env->lck->pgops.incoherence.weak + 1;
  return ok;
}
/* Workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269:
 * yields the CPU while waiting for a page/snapshot to become coherent.
 *
 * On the first call *timestamp is initialized with the current monotonic
 * time; afterwards the elapsed time is checked against 65536/10 in 16.16
 * fixed-point form, i.e. 0.1 second.
 *
 * Returns MDBX_RESULT_TRUE when the caller should retry after the yield,
 * or MDBX_PROBLEM once the timeout has expired (bailout). */
__cold int coherency_timeout(uint64_t *timestamp, intptr_t pgno,
                             const MDBX_env *env) {
  if (likely(timestamp && *timestamp == 0))
    *timestamp = osal_monotime();
  else if (unlikely(!timestamp || osal_monotime() - *timestamp >
                                      osal_16dot16_to_monotime(65536 / 10))) {
    if (pgno >= 0 && pgno != env->stuck_meta)
      /* cast to size_t matches the unsigned PRIuSIZE specifier; it is
       * lossless here since the branch guarantees pgno >= 0 */
      ERROR("bailout waiting for %" PRIuSIZE " page arrival %s", (size_t)pgno,
            "(workaround for incoherent flaw of unified page/buffer cache)");
    else if (env->stuck_meta < 0)
      ERROR("bailout waiting for valid snapshot (%s)",
            "workaround for incoherent flaw of unified page/buffer cache");
    return MDBX_PROBLEM;
  }

  osal_memory_fence(mo_AcquireRelease, true);
#if defined(_WIN32) || defined(_WIN64)
  SwitchToThread();
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
  sched_yield();
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
  pthread_yield();
#else
  usleep(42);
#endif
  return MDBX_RESULT_TRUE;
}
/* check with timeout as the workaround
 * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */
/* Copies the geometry, core tree headers and canary from the head
 * meta-page into the transaction, then validates the snapshot with
 * coherency_check(); on failure falls back to coherency_timeout()
 * (reporting only on the first attempt, i.e. while *timestamp == 0). */
__hot int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head,
                               uint64_t *timestamp) {
  /* Copy the DB info and flags */
  txn->geo = head.ptr_v->geometry;
  memcpy(txn->dbs, &head.ptr_c->trees, sizeof(head.ptr_c->trees));
  STATIC_ASSERT(sizeof(head.ptr_c->trees) == CORE_DBS * sizeof(tree_t));
  VALGRIND_MAKE_MEM_UNDEFINED(txn->dbs + CORE_DBS,
                              txn->env->max_dbi - CORE_DBS);
  txn->canary = head.ptr_v->canary;

  if (unlikely(!coherency_check(txn->env, head.txnid, txn->dbs, head.ptr_v,
                                *timestamp == 0)))
    return coherency_timeout(timestamp, -1, txn->env);

  tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
  tASSERT(txn, check_sdb_flags(txn->dbs[MAIN_DBI].flags));
  return MDBX_SUCCESS;
}
/* Verifies that a just-written meta-page has become visible with a
 * txnid of at least `txnid`, and that it is internally coherent;
 * otherwise defers to coherency_timeout(). Reporting happens only on
 * the first attempt (when *timestamp is zero or absent). */
int coherency_check_written(const MDBX_env *env, const txnid_t txnid,
                            const volatile meta_t *meta, const intptr_t pgno,
                            uint64_t *timestamp) {
  const bool report = !(timestamp && *timestamp);
  const txnid_t head_txnid = meta_txnid(meta);
  if (unlikely(head_txnid < MIN_TXNID || head_txnid < txnid)) {
    if (report) {
      /* saturating increment of the incoherence statistics counter */
      env->lck->pgops.incoherence.weak =
          (env->lck->pgops.incoherence.weak >= INT32_MAX)
              ? INT32_MAX
              : env->lck->pgops.incoherence.weak + 1;
      WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s",
              (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid,
              bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base)),
              "(workaround for incoherent flaw of unified page/buffer cache)");
    }
    return coherency_timeout(timestamp, pgno, env);
  }
  if (unlikely(
          !coherency_check(env, head_txnid, &meta->trees.gc, meta, report)))
    return coherency_timeout(timestamp, pgno, env);

  eASSERT(env, meta->trees.gc.flags == MDBX_INTEGERKEY);
  eASSERT(env, check_sdb_flags(meta->trees.main.flags));
  return MDBX_SUCCESS;
}
/* Boolean wrapper over coherency_check_written() for validating a
 * meta-page; timeout bookkeeping is engaged only when reporting. */
bool coherency_check_meta(const MDBX_env *env, const volatile meta_t *meta,
                          bool report) {
  uint64_t timestamp = 0;
  uint64_t *const stamp_ptr = report ? &timestamp : nullptr;
  const int err = coherency_check_written(env, 0, meta, -1, stamp_ptr);
  return err == MDBX_SUCCESS;
}

768
src/cold.c Normal file
View File

@ -0,0 +1,768 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Returns the system page size clamped into the supported MDBX range
 * [MDBX_MIN_PAGESIZE, MDBX_MAX_PAGESIZE]. */
__cold size_t mdbx_default_pagesize(void) {
  const size_t pagesize = globals.sys_pagesize;
  ENSURE(nullptr, is_powerof2(pagesize));
  if (pagesize < MDBX_MIN_PAGESIZE)
    return MDBX_MIN_PAGESIZE;
  if (pagesize > MDBX_MAX_PAGESIZE)
    return MDBX_MAX_PAGESIZE;
  return pagesize;
}
/* Minimal database size for the given page size (or the default one
 * when pagesize < 1); returns -1 for an invalid page size. */
__cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  else if (unlikely(!is_powerof2((size_t)pagesize) ||
                    pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
                    pagesize > (intptr_t)MDBX_MAX_PAGESIZE))
    return -1;

  return MIN_PAGENO * pagesize;
}
/* Maximal database size for the given page size (or the default one
 * when pagesize < 1), capped by MAX_MAPSIZE; -1 for an invalid size. */
__cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
                    pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
                    !is_powerof2((size_t)pagesize)))
    return -1;

  STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
  const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize;
  return (limit < MAX_MAPSIZE) ? (intptr_t)limit : (intptr_t)MAX_MAPSIZE;
}
/* Maximal size of data that may be modified within a single transaction
 * for the given page size (or the default one when pagesize < 1),
 * derived from the page-list and mapsize limits scaled down by the
 * golden ratio; -1 for an invalid page size. */
__cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
                    pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
                    !is_powerof2((size_t)pagesize)))
    return -1;

  STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
  const uint64_t pgl_limit =
      pagesize * (uint64_t)(PAGELIST_LIMIT / MDBX_GOLD_RATIO_DBL);
  const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL);
  return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit;
}
/* Maximal key length for the given page size (or the default one when
 * pagesize < 1) and table flags; -1 for an invalid page size. */
__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize,
                                        MDBX_db_flags_t flags) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  const bool valid = pagesize >= (intptr_t)MDBX_MIN_PAGESIZE &&
                     pagesize <= (intptr_t)MDBX_MAX_PAGESIZE &&
                     is_powerof2((size_t)pagesize);
  if (unlikely(!valid))
    return -1;
  return keysize_max(pagesize, flags);
}
/* Maximal key length for the environment's page size and the given
 * table flags; -1 for an invalid environment handle. */
__cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env,
                                      MDBX_db_flags_t flags) {
  if (unlikely(env == nullptr || env->signature.weak != env_signature))
    return -1;
  const intptr_t limit = mdbx_limits_keysize_max((intptr_t)env->ps, flags);
  return (int)limit;
}
/* Legacy shortcut: maximal key length assuming MDBX_DUPSORT tables
 * (the most restrictive case). */
__cold int mdbx_env_get_maxkeysize(const MDBX_env *env) {
  return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT);
}
/* Minimal key length for the given table flags (page-size independent). */
__cold intptr_t mdbx_limits_keysize_min(MDBX_db_flags_t flags) {
  return keysize_min(flags);
}
/* Maximal data length for the given page size (or the default one when
 * pagesize < 1) and table flags; -1 for an invalid page size. */
__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize,
                                        MDBX_db_flags_t flags) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  const bool valid = pagesize >= (intptr_t)MDBX_MIN_PAGESIZE &&
                     pagesize <= (intptr_t)MDBX_MAX_PAGESIZE &&
                     is_powerof2((size_t)pagesize);
  if (unlikely(!valid))
    return -1;
  return valsize_max(pagesize, flags);
}
/* Maximal data length for the environment's page size and the given
 * table flags; -1 for an invalid environment handle. */
__cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env,
                                      MDBX_db_flags_t flags) {
  if (unlikely(env == nullptr || env->signature.weak != env_signature))
    return -1;
  const intptr_t limit = mdbx_limits_valsize_max((intptr_t)env->ps, flags);
  return (int)limit;
}
/* Minimal data length for the given table flags (page-size independent). */
__cold intptr_t mdbx_limits_valsize_min(MDBX_db_flags_t flags) {
  return valsize_min(flags);
}
/* Maximal size of a key-value pair that still fits into a single page
 * (i.e. without spilling the value to a large/overflow page) for the
 * given page size and table flags; -1 for an invalid page size. */
__cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize,
                                              MDBX_db_flags_t flags) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
               pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
               !is_powerof2((size_t)pagesize)))
    return -1;

  if (flags &
      (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP))
    /* dupsort pairs are constrained by the branch-node limit */
    return BRANCH_NODE_MAX(pagesize) - NODESIZE;

  return LEAF_NODE_MAX(pagesize) - NODESIZE;
}
/* Maximal in-page key-value pair size for the environment's page size
 * and the given table flags; -1 for an invalid environment handle. */
__cold int mdbx_env_get_pairsize4page_max(const MDBX_env *env,
                                          MDBX_db_flags_t flags) {
  if (unlikely(env == nullptr || env->signature.weak != env_signature))
    return -1;
  const intptr_t limit = mdbx_limits_pairsize4page_max((intptr_t)env->ps, flags);
  return (int)limit;
}
/* Maximal data length that still fits into a single page for the given
 * page size and table flags; -1 for an invalid page size. For dupsort
 * tables this equals the general value limit, otherwise it is the
 * usable payload space of one page. */
__cold intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize,
                                             MDBX_db_flags_t flags) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE ||
               pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
               !is_powerof2((size_t)pagesize)))
    return -1;

  if (flags &
      (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP))
    return valsize_max(pagesize, flags);

  return PAGESPACE(pagesize);
}
/* Maximal in-page data length for the environment's page size and the
 * given table flags; -1 for an invalid environment handle. */
__cold int mdbx_env_get_valsize4page_max(const MDBX_env *env,
                                         MDBX_db_flags_t flags) {
  if (unlikely(env == nullptr || env->signature.weak != env_signature))
    return -1;
  const intptr_t limit = mdbx_limits_valsize4page_max((intptr_t)env->ps, flags);
  return (int)limit;
}
/*----------------------------------------------------------------------------*/
/* Accumulates the statistics of one tree into *st. The ms_mod_txnid
 * field (the maximum over all trees) is updated only when the
 * caller-provided buffer is large enough to contain it. */
__cold static void stat_add(const tree_t *db, MDBX_stat *const st,
                            const size_t bytes) {
  st->ms_depth += db->height;
  st->ms_branch_pages += db->branch_pages;
  st->ms_leaf_pages += db->leaf_pages;
  st->ms_overflow_pages += db->large_pages;
  st->ms_entries += db->items;
  if (likely(bytes >=
             offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid)))
    st->ms_mod_txnid =
        (st->ms_mod_txnid > db->mod_txnid) ? st->ms_mod_txnid : db->mod_txnid;
}
/* Accumulates statistics for the whole environment into *st: first all
 * tables opened in the transaction (skipping GC), then — unless MAIN is
 * a dupsort table — scans the MAIN tree for named subDBs that are not
 * opened and accounts them from their stored tree_t headers. */
__cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) {
  int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  cursor_couple_t cx;
  err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  const MDBX_env *const env = txn->env;
  st->ms_psize = env->ps;
  /* account every valid, non-stale table opened in this txn */
  TXN_FOREACH_DBI_FROM(
      txn, dbi,
      /* assuming GC is internal and not subject for accounting */ MAIN_DBI) {
    if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID)
      stat_add(txn->dbs + dbi, st, bytes);
  }

  if (!(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT) &&
      txn->dbs[MAIN_DBI].items /* TODO: use `md_subs` field */) {

    /* scan and account not opened named subDBs */
    err = tree_search(&cx.outer, nullptr, Z_FIRST);
    while (err == MDBX_SUCCESS) {
      const page_t *mp = cx.outer.pg[cx.outer.top];
      for (size_t i = 0; i < page_numkeys(mp); i++) {
        const node_t *node = page_node(mp, i);
        if (node_flags(node) != N_SUBDATA)
          continue;
        if (unlikely(node_ds(node) != sizeof(tree_t))) {
          ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED,
                "invalid subDb node size", node_ds(node));
          return MDBX_CORRUPTED;
        }

        /* skip opened and already accounted */
        const MDBX_val name = {node_key(node), node_ks(node)};
        TXN_FOREACH_DBI_USER(txn, dbi) {
          if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID &&
              env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name) == 0) {
            node = nullptr;
            break;
          }
        }

        if (node) {
          /* unaligned in-page tree header: copy before use */
          tree_t db;
          memcpy(&db, node_data(node), sizeof(db));
          stat_add(&db, st, bytes);
        }
      }
      err = cursor_sibling_right(&cx.outer);
    }
    if (unlikely(err != MDBX_NOTFOUND))
      return err;
  }

  return MDBX_SUCCESS;
}
/* Fills *dest with environment statistics. Accepts either the full
 * MDBX_stat size or the legacy size without ms_mod_txnid. Uses the
 * given txn when provided; otherwise reuses the env's own write-txn if
 * owned by this thread, or starts a temporary read-only one. */
__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn,
                            MDBX_stat *dest, size_t bytes) {
  if (unlikely(!dest))
    return MDBX_EINVAL;
  const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
  if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
    return MDBX_EINVAL;

  if (likely(txn)) {
    if (env && unlikely(txn->env != env))
      return MDBX_EINVAL;
    return stat_acc(txn, dest, bytes);
  }

  int err = check_env(env, true);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (env->txn && env_txn0_owned(env))
    /* inside write-txn */
    return stat_acc(env->txn, dest, bytes);

  /* no suitable txn at hand: use a temporary read-only one */
  MDBX_txn *tmp_txn;
  err = mdbx_txn_begin((MDBX_env *)env, nullptr, MDBX_TXN_RDONLY, &tmp_txn);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  const int rc = stat_acc(tmp_txn, dest, bytes);
  err = mdbx_txn_abort(tmp_txn);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  return rc;
}
/*----------------------------------------------------------------------------*/
/* Rough estimate of the RSS needed to keep the whole database resident:
 * the database size plus ~1.6% overhead and a fixed, word-size-dependent
 * surcharge in megabytes. */
static size_t estimate_rss(size_t database_bytes) {
  return database_bytes + database_bytes / 64 +
         (512 + MDBX_WORDBITS * 16) * MEGABYTE;
}
__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
MDBX_warmup_flags_t flags,
unsigned timeout_seconds_16dot16) {
if (unlikely(env == nullptr && txn == nullptr))
return MDBX_EINVAL;
if (unlikely(flags >
(MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock |
MDBX_warmup_touchlimit | MDBX_warmup_release)))
return MDBX_EINVAL;
if (txn) {
int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
if (unlikely(err != MDBX_SUCCESS))
return err;
}
if (env) {
int err = check_env(env, false);
if (unlikely(err != MDBX_SUCCESS))
return err;
if (txn && unlikely(txn->env != env))
return MDBX_EINVAL;
} else {
env = txn->env;
}
const uint64_t timeout_monotime =
(timeout_seconds_16dot16 && (flags & MDBX_warmup_force))
? osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16)
: 0;
if (flags & MDBX_warmup_release)
munlock_all(env);
pgno_t used_pgno;
if (txn) {
used_pgno = txn->geo.first_unallocated;
} else {
const troika_t troika = meta_tap(env);
used_pgno = meta_recent(env, &troika).ptr_v->geometry.first_unallocated;
}
const size_t used_range = pgno_align2os_bytes(env, used_pgno);
const pgno_t mlock_pgno = bytes2pgno(env, used_range);
int rc = MDBX_SUCCESS;
if (flags & MDBX_warmup_touchlimit) {
const size_t estimated_rss = estimate_rss(used_range);
#if defined(_WIN32) || defined(_WIN64)
SIZE_T current_ws_lower, current_ws_upper;
if (GetProcessWorkingSetSize(GetCurrentProcess(), &current_ws_lower,
&current_ws_upper) &&
current_ws_lower < estimated_rss) {
const SIZE_T ws_lower = estimated_rss;
const SIZE_T ws_upper =
(MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048)
? ws_lower
: ws_lower + MDBX_WORDBITS * MEGABYTE * 32;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) {
rc = (int)GetLastError();
WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower,
ws_upper, rc);
}
}
#endif /* Windows */
#ifdef RLIMIT_RSS
struct rlimit rss;
if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) {
rss.rlim_cur = estimated_rss;
if (rss.rlim_max < estimated_rss)
rss.rlim_max = estimated_rss;
if (setrlimit(RLIMIT_RSS, &rss)) {
rc = errno;
WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS",
(size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc);
}
}
#endif /* RLIMIT_RSS */
#ifdef RLIMIT_MEMLOCK
if (flags & MDBX_warmup_lock) {
struct rlimit memlock;
if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 &&
memlock.rlim_cur < estimated_rss) {
memlock.rlim_cur = estimated_rss;
if (memlock.rlim_max < estimated_rss)
memlock.rlim_max = estimated_rss;
if (setrlimit(RLIMIT_MEMLOCK, &memlock)) {
rc = errno;
WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK",
(size_t)memlock.rlim_cur, (size_t)memlock.rlim_max, rc);
}
}
}
#endif /* RLIMIT_MEMLOCK */
(void)estimated_rss;
}
#if defined(MLOCK_ONFAULT) && \
((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) || \
(defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) && \
(defined(__linux__) || defined(__gnu_linux__))
if ((flags & MDBX_warmup_lock) != 0 &&
globals.linux_kernel_version >= 0x04040000 &&
atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
if (mlock2(env->dxb_mmap.base, used_range, MLOCK_ONFAULT)) {
rc = errno;
WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc);
} else {
update_mlcnt(env, mlock_pgno, true);
rc = MDBX_SUCCESS;
}
if (rc != EINVAL)
flags -= MDBX_warmup_lock;
}
#endif /* MLOCK_ONFAULT */
int err = MDBX_ENOSYS;
#if MDBX_ENABLE_MADVISE
err = dxb_set_readahead(env, used_pgno, true, true);
#else
#if defined(_WIN32) || defined(_WIN64)
if (imports.PrefetchVirtualMemory) {
WIN32_MEMORY_RANGE_ENTRY hint;
hint.VirtualAddress = env->dxb_mmap.base;
hint.NumberOfBytes = used_range;
if (imports.PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0))
err = MDBX_SUCCESS;
else {
err = (int)GetLastError();
ERROR("%s(%zu) error %d", "PrefetchVirtualMemory", used_range, err);
}
}
#endif /* Windows */
#if defined(POSIX_MADV_WILLNEED)
err = posix_madvise(env->dxb_mmap.base, used_range, POSIX_MADV_WILLNEED)
? ignore_enosys(errno)
: MDBX_SUCCESS;
#elif defined(MADV_WILLNEED)
err = madvise(env->dxb_mmap.base, used_range, MADV_WILLNEED)
? ignore_enosys(errno)
: MDBX_SUCCESS;
#endif
#if defined(F_RDADVISE)
if (err) {
fcntl(env->lazy_fd, F_RDAHEAD, true);
struct radvisory hint;
hint.ra_offset = 0;
hint.ra_count = unlikely(used_range > INT_MAX &&
sizeof(used_range) > sizeof(hint.ra_count))
? INT_MAX
: (int)used_range;
err = fcntl(env->lazy_fd, F_RDADVISE, &hint) ? ignore_enosys(errno)
: MDBX_SUCCESS;
if (err == ENOTTY)
err = MDBX_SUCCESS /* Ignore ENOTTY for DB on the ram-disk */;
}
#endif /* F_RDADVISE */
#endif /* MDBX_ENABLE_MADVISE */
if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS)
rc = err;
if ((flags & MDBX_warmup_force) != 0 &&
(rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) {
const volatile uint8_t *ptr = env->dxb_mmap.base;
size_t offset = 0, unused = 42;
#if !(defined(_WIN32) || defined(_WIN64))
if (flags & MDBX_warmup_oomsafe) {
const int null_fd = open("/dev/null", O_WRONLY);
if (unlikely(null_fd < 0))
rc = errno;
else {
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
for (;;) {
unsigned i;
for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) {
iov[i].iov_base = (void *)(ptr + offset);
iov[i].iov_len = 1;
offset += globals.sys_pagesize;
}
if (unlikely(writev(null_fd, iov, i) < 0)) {
rc = errno;
if (rc == EFAULT)
rc = ENOMEM;
break;
}
if (offset >= used_range) {
rc = MDBX_SUCCESS;
break;
}
if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
rc = MDBX_RESULT_TRUE;
break;
}
}
close(null_fd);
}
} else
#endif /* Windows */
for (;;) {
unused += ptr[offset];
offset += globals.sys_pagesize;
if (offset >= used_range) {
rc = MDBX_SUCCESS;
break;
}
if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
rc = MDBX_RESULT_TRUE;
break;
}
}
(void)unused;
}
if ((flags & MDBX_warmup_lock) != 0 &&
(rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) &&
atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
#if defined(_WIN32) || defined(_WIN64)
if (VirtualLock(env->dxb_mmap.base, used_range)) {
update_mlcnt(env, mlock_pgno, true);
rc = MDBX_SUCCESS;
} else {
rc = (int)GetLastError();
WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc);
}
#elif defined(_POSIX_MEMLOCK_RANGE)
if (mlock(env->dxb_mmap.base, used_range) == 0) {
update_mlcnt(env, mlock_pgno, true);
rc = MDBX_SUCCESS;
} else {
rc = errno;
WARNING("%s(%zu) error %d", "mlock", used_range, rc);
}
#else
rc = MDBX_ENOSYS;
#endif
}
return rc;
}
/*----------------------------------------------------------------------------*/
/* Returns the file descriptor of the main data file in `*arg`.
 * The environment must pass validation in the active state. */
__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) {
  const int err = check_env(env, true);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(arg == nullptr))
    return MDBX_EINVAL;
  *arg = env->lazy_fd;
  return MDBX_SUCCESS;
}
/* Sets (onoff == true) or clears (onoff == false) environment flags.
 * Only flags from ENV_CHANGEABLE_FLAGS may be changed once the environment
 * is active; before activation any ENV_USABLE_FLAGS are accepted. */
__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags,
                              bool onoff) {
  int rc = check_env(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (env->flags & ENV_ACTIVE) {
    if (unlikely(flags & ~ENV_CHANGEABLE_FLAGS))
      return MDBX_EPERM;
  } else if (unlikely(flags & ~ENV_USABLE_FLAGS))
    return MDBX_EPERM;

  if (unlikely(env->flags & MDBX_RDONLY))
    return MDBX_EACCESS;

  /* Serialize against writers unless this thread already owns txn0. */
  const bool need_lock = (env->flags & ENV_ACTIVE) && !env_txn0_owned(env);
  if (need_lock) {
    rc = lck_txn_lock(env, false);
    if (unlikely(rc))
      return rc;
  }

  if (onoff)
    env->flags = combine_durability_flags(env->flags, flags);
  else
    env->flags &= ~flags;

  if (need_lock)
    lck_txn_unlock(env);
  return MDBX_SUCCESS;
}
__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) {
int rc = check_env(env, false);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (unlikely(!arg))
return MDBX_EINVAL;
*arg = env->flags & ENV_USABLE_FLAGS;
return MDBX_SUCCESS;
}
/* Stores an opaque application-owned context pointer in the environment. */
__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) {
  const int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  env->userctx = ctx;
  return MDBX_SUCCESS;
}
/* Returns the application context pointer, or nullptr for a null env. */
__cold void *mdbx_env_get_userctx(const MDBX_env *env) {
  if (!env)
    return nullptr;
  return env->userctx;
}
/* Installs an assertion-failure callback.
 * Only effective in debug builds; otherwise reports MDBX_ENOSYS. */
__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) {
  const int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
#if MDBX_DEBUG
  env->assert_func = func;
  return MDBX_SUCCESS;
#else
  (void)func; /* unused in non-debug builds */
  return MDBX_ENOSYS;
#endif
}
/* Installs the Handle-Slow-Readers callback for the environment. */
__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) {
  const int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  env->hsr_callback = hsr;
  return MDBX_SUCCESS;
}
/* Returns the installed Handle-Slow-Readers callback, or nullptr when the
 * environment pointer is null or its signature is invalid. */
__cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) {
  if (likely(env && env->signature.weak == env_signature))
    return env->hsr_callback;
  return nullptr;
}
#if defined(_WIN32) || defined(_WIN64)
__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (unlikely(!arg))
return MDBX_EINVAL;
*arg = env->pathname.specified;
return MDBX_SUCCESS;
}
#endif /* Windows */
/* Returns the (multibyte) pathname the environment was opened with.
 * On Windows the stored pathname is wide; it is converted to the current
 * thread's ACP on first request and cached in env->pathname_char. */
__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!arg))
    return MDBX_EINVAL;

#if defined(_WIN32) || defined(_WIN64)
  if (!env->pathname_char) {
    *arg = nullptr;
    DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80;
    /* first pass: query the required buffer length (mb_len includes NUL) */
    size_t mb_len =
        WideCharToMultiByte(CP_THREAD_ACP, flags, env->pathname.specified, -1,
                            nullptr, 0, nullptr, nullptr);
    rc = mb_len ? MDBX_SUCCESS : (int)GetLastError();
    if (rc == ERROR_INVALID_FLAGS) {
      /* the OS rejected WC_ERR_INVALID_CHARS — retry without it */
      mb_len =
          WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->pathname.specified,
                              -1, nullptr, 0, nullptr, nullptr);
      rc = mb_len ? MDBX_SUCCESS : (int)GetLastError();
    }
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;

    char *const mb_pathname = osal_malloc(mb_len);
    if (!mb_pathname)
      return MDBX_ENOMEM;
    /* second pass: perform the actual conversion into the buffer */
    if (mb_len != (size_t)WideCharToMultiByte(
                      CP_THREAD_ACP, flags, env->pathname.specified, -1,
                      mb_pathname, (int)mb_len, nullptr, nullptr)) {
      rc = (int)GetLastError();
      osal_free(mb_pathname);
      return rc;
    }
    /* publish the cached conversion; if another thread raced us and won,
     * discard our copy */
    if (env->pathname_char ||
        InterlockedCompareExchangePointer((PVOID volatile *)&env->pathname_char,
                                          mb_pathname, nullptr))
      osal_free(mb_pathname);
  }
  *arg = env->pathname_char;
#else
  *arg = env->pathname.specified;
#endif /* Windows */
  return MDBX_SUCCESS;
}
/*------------------------------------------------------------------------------
* Legacy API */
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API

/* Out-of-line exports of the legacy API: each function below merely forwards
 * to the corresponding __inline_* implementation, preserving ABI
 * compatibility for binaries linked against older libmdbx releases. */

LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent,
                               MDBX_txn_flags_t flags, MDBX_txn **ret) {
  return __inline_mdbx_txn_begin(env, parent, flags, ret);
}

LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn) {
  return __inline_mdbx_txn_commit(txn);
}

LIBMDBX_API __cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat,
                                     size_t bytes) {
  return __inline_mdbx_env_stat(env, stat, bytes);
}

LIBMDBX_API __cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info,
                                     size_t bytes) {
  return __inline_mdbx_env_info(env, info, bytes);
}

LIBMDBX_API int mdbx_dbi_flags(const MDBX_txn *txn, MDBX_dbi dbi,
                               unsigned *flags) {
  return __inline_mdbx_dbi_flags(txn, dbi, flags);
}

LIBMDBX_API __cold int mdbx_env_sync(MDBX_env *env) {
  return __inline_mdbx_env_sync(env);
}

LIBMDBX_API __cold int mdbx_env_sync_poll(MDBX_env *env) {
  return __inline_mdbx_env_sync_poll(env);
}

LIBMDBX_API __cold int mdbx_env_close(MDBX_env *env) {
  return __inline_mdbx_env_close(env);
}

LIBMDBX_API __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) {
  return __inline_mdbx_env_set_mapsize(env, size);
}

LIBMDBX_API __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) {
  return __inline_mdbx_env_set_maxdbs(env, dbs);
}

LIBMDBX_API __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) {
  return __inline_mdbx_env_get_maxdbs(env, dbs);
}

LIBMDBX_API __cold int mdbx_env_set_maxreaders(MDBX_env *env,
                                               unsigned readers) {
  return __inline_mdbx_env_set_maxreaders(env, readers);
}

LIBMDBX_API __cold int mdbx_env_get_maxreaders(const MDBX_env *env,
                                               unsigned *readers) {
  return __inline_mdbx_env_get_maxreaders(env, readers);
}

LIBMDBX_API __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) {
  return __inline_mdbx_env_set_syncbytes(env, threshold);
}

LIBMDBX_API __cold int mdbx_env_get_syncbytes(const MDBX_env *env,
                                              size_t *threshold) {
  return __inline_mdbx_env_get_syncbytes(env, threshold);
}

LIBMDBX_API __cold int mdbx_env_set_syncperiod(MDBX_env *env,
                                               unsigned seconds_16dot16) {
  return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16);
}

LIBMDBX_API __cold int mdbx_env_get_syncperiod(const MDBX_env *env,
                                               unsigned *seconds_16dot16) {
  return __inline_mdbx_env_get_syncperiod(env, seconds_16dot16);
}

LIBMDBX_API __cold uint64_t mdbx_key_from_int64(const int64_t i64) {
  return __inline_mdbx_key_from_int64(i64);
}

LIBMDBX_API __cold uint32_t mdbx_key_from_int32(const int32_t i32) {
  return __inline_mdbx_key_from_int32(i32);
}

LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_min(void) {
  return __inline_mdbx_limits_pgsize_min();
}

LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_max(void) {
  return __inline_mdbx_limits_pgsize_max();
}

#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */

View File

@ -11,6 +11,9 @@
#cmakedefine ENABLE_ASAN
#cmakedefine ENABLE_UBSAN
#cmakedefine01 MDBX_FORCE_ASSERTIONS
#if !defined(MDBX_BUILD_TEST) && !defined(MDBX_BUILD_CXX)
#cmakedefine01 MDBX_BUILD_CXX
#endif
/* Common */
#cmakedefine01 MDBX_TXN_CHECKOWNER
@ -37,7 +40,9 @@
#cmakedefine01 MDBX_ENABLE_DBI_LOCKFREE
/* Windows */
#if !defined(MDBX_BUILD_TEST) && !defined(MDBX_WITHOUT_MSVC_CRT)
#cmakedefine01 MDBX_WITHOUT_MSVC_CRT
#endif
/* MacOS & iOS */
#cmakedefine01 MDBX_OSX_SPEED_INSTEADOF_DURABILITY

781
src/copy.c Normal file
View File

@ -0,0 +1,781 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* State of a compacting copy, shared between the walker (reader) thread and
 * the dedicated writer thread, which exchange two ping-pong buffers.
 * head/tail are monotonically increasing toggle counters: the parity of each
 * selects a buffer, and (head - tail) is the number of buffers handed to the
 * writer and not yet consumed. */
typedef struct compacting_context {
  MDBX_env *env;
  MDBX_txn *txn;            /* the read transaction being copied */
  pgno_t first_unallocated; /* next page number to assign in the destination */
  osal_condpair_t condpair; /* lock + condition pair for buffer hand-off */
  volatile unsigned head;   /* count of buffers published by the walker */
  volatile unsigned tail;   /* count of buffers consumed by the writer */
  uint8_t *write_buf[2];    /* the two ping-pong output buffers */
  size_t write_len[2];      /* used bytes per buffer; 0 acts as EOF marker */
  /* Error code. Never cleared if set. Both threads can set nonzero
   * to fail the copy. Not mutex-protected, expects atomic int. */
  volatile int error;
  mdbx_filehandle_t fd;     /* destination file handle */
} ctx_t;

__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree);
/* Dedicated writer thread for compacting copy.
 * Consumes buffers published via head/tail toggles and writes them to the
 * destination fd; a zero-length buffer signals EOF. Any failure is latched
 * into ctx->error, which also stops the producer side. */
__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
  ctx_t *const ctx = arg;

#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
  /* Block SIGPIPE so a write into a closed pipe surfaces as EPIPE
   * instead of killing the process; the pending signal is drained below. */
  sigset_t sigset;
  sigemptyset(&sigset);
  sigaddset(&sigset, SIGPIPE);
  ctx->error = pthread_sigmask(SIG_BLOCK, &sigset, nullptr);
#endif /* EPIPE */

  osal_condpair_lock(&ctx->condpair);
  while (!ctx->error) {
    /* wait until the walker publishes a buffer (head advances past tail) */
    while (ctx->tail == ctx->head && !ctx->error) {
      int err = osal_condpair_wait(&ctx->condpair, true);
      if (err != MDBX_SUCCESS) {
        ctx->error = err;
        goto bailout;
      }
    }
    const unsigned toggle = ctx->tail & 1;
    size_t wsize = ctx->write_len[toggle];
    if (wsize == 0) {
      /* zero-length buffer is the end-of-stream marker */
      ctx->tail += 1;
      break /* EOF */;
    }
    ctx->write_len[toggle] = 0;
    uint8_t *ptr = ctx->write_buf[toggle];
    if (!ctx->error) {
      int err = osal_write(ctx->fd, ptr, wsize);
      if (err != MDBX_SUCCESS) {
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
        if (err == EPIPE) {
          /* Collect the pending SIGPIPE,
           * otherwise at least OS X gives it to the process on thread-exit. */
          int unused;
          sigwait(&sigset, &unused);
        }
#endif /* EPIPE */
        ctx->error = err;
        goto bailout;
      }
    }
    ctx->tail += 1;
    /* tell the walker that a buffer became free again */
    osal_condpair_signal(&ctx->condpair, false);
  }
bailout:
  osal_condpair_unlock(&ctx->condpair);
  return (THREAD_RESULT)0;
}
/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer.
 * Publishes the currently-filled buffer by advancing `head` and blocks while
 * both buffers are owned by the writer. Returns the latched error code, i.e.
 * MDBX_SUCCESS when the copy is still healthy. */
__cold static int compacting_toggle_write_buffers(ctx_t *ctx) {
  osal_condpair_lock(&ctx->condpair);
  eASSERT(ctx->env, ctx->head - ctx->tail < 2 || ctx->error);
  ctx->head += 1; /* hand the just-filled buffer over to the writer */
  osal_condpair_signal(&ctx->condpair, true);
  while (!ctx->error && ctx->head - ctx->tail == 2 /* both buffers in use */) {
    int err = osal_condpair_wait(&ctx->condpair, false);
    if (err != MDBX_SUCCESS)
      ctx->error = err;
  }
  osal_condpair_unlock(&ctx->condpair);
  return ctx->error;
}
/* Appends `bytes` from `src` (or zero-fill when src == nullptr) into the
 * current write buffer, toggling buffers as they fill up. When `pgno` is
 * nonzero, the first copied chunk must contain a complete page header, whose
 * pgno/txnid/pages fields are patched for the compacted destination. */
static int compacting_put_bytes(ctx_t *ctx, const void *src, size_t bytes,
                                pgno_t pgno, pgno_t npages) {
  assert(pgno == 0 || bytes > PAGEHDRSZ);
  while (bytes > 0) {
    const size_t side = ctx->head & 1;
    const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->write_len[side];
    if (left < (pgno ? PAGEHDRSZ : 1)) {
      /* no room for (at least) a page header: hand off the buffer, retry */
      int err = compacting_toggle_write_buffers(ctx);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      continue;
    }
    const size_t chunk = (bytes < left) ? bytes : left;
    void *const dst = ctx->write_buf[side] + ctx->write_len[side];
    if (src) {
      memcpy(dst, src, chunk);
      if (pgno) {
        /* patch the page header at the start of the first chunk */
        assert(chunk > PAGEHDRSZ);
        page_t *mp = dst;
        mp->pgno = pgno;
        if (mp->txnid == 0)
          mp->txnid = ctx->txn->txnid;
        if (mp->flags == P_LARGE) {
          assert(bytes <= pgno2bytes(ctx->env, npages));
          mp->pages = npages;
        }
        pgno = 0; /* header done; the rest is plain payload */
      }
      src = ptr_disp(src, chunk);
    } else
      memset(dst, 0, chunk);
    bytes -= chunk;
    ctx->write_len[side] += chunk;
  }
  return MDBX_SUCCESS;
}
/* Emits one (possibly multi-page) page to the destination as three parts:
 * `head_bytes` of meaningful head, a zero-filled middle gap, and
 * `tail_bytes` from the page's tail (the node area of branch/leaf pages).
 * Allocates the destination page number(s) from ctx->first_unallocated. */
static int compacting_put_page(ctx_t *ctx, const page_t *mp,
                               const size_t head_bytes, const size_t tail_bytes,
                               const pgno_t npages) {
  if (tail_bytes) {
    assert(head_bytes + tail_bytes <= ctx->env->ps);
    assert(npages == 1 &&
           (page_type(mp) == P_BRANCH || page_type(mp) == P_LEAF));
  } else {
    assert(head_bytes <= pgno2bytes(ctx->env, npages));
    assert((npages == 1 && page_type(mp) == (P_LEAF | P_DUPFIX)) ||
           page_type(mp) == P_LARGE);
  }

  /* claim page number(s) in the compacted destination */
  const pgno_t pgno = ctx->first_unallocated;
  ctx->first_unallocated += npages;
  int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  /* zero-fill the unused middle region */
  err = compacting_put_bytes(
      ctx, nullptr, pgno2bytes(ctx->env, npages) - (head_bytes + tail_bytes), 0,
      0);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  return compacting_put_bytes(ctx, ptr_disp(mp, ctx->env->ps - tail_bytes),
                              tail_bytes, 0, 0);
}
/* Depth-first walk over one b-tree rooted at `*parent_pgno`, renumbering
 * its pages into a contiguous range and streaming them through the write
 * buffers. On success `*parent_pgno` is updated to the new root pgno.
 * Recurses into nested dupsort trees and sub-databases found in leaf nodes. */
__cold static int compacting_walk(ctx_t *ctx, MDBX_cursor *mc,
                                  pgno_t *const parent_pgno,
                                  txnid_t parent_txnid) {
  mc->top = 0;
  mc->ki[0] = 0;
  int rc = page_get(mc, *parent_pgno, &mc->pg[0], parent_txnid);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  rc = tree_search_finalize(mc, nullptr, Z_FIRST);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Make cursor pages writable: copy the whole descent path into a private
   * buffer sized for (depth + 1) pages. */
  const intptr_t deep_limit = mc->top + 1;
  void *const buf = osal_malloc(pgno2bytes(ctx->env, deep_limit + 1));
  if (buf == nullptr)
    return MDBX_ENOMEM;
  void *ptr = buf;
  for (intptr_t i = 0; i <= mc->top; i++) {
    page_copy(ptr, mc->pg[i], ctx->env->ps);
    mc->pg[i] = ptr;
    ptr = ptr_disp(ptr, ctx->env->ps);
  }
  /* This is writable space for a leaf page. Usually not needed. */
  page_t *const leaf = ptr;

  while (mc->top >= 0) {
    page_t *mp = mc->pg[mc->top];
    const size_t nkeys = page_numkeys(mp);
    if (is_leaf(mp)) {
      if (!(mc->flags &
            z_inner) /* may have nested N_SUBDATA or N_BIGDATA nodes */) {
        for (size_t i = 0; i < nkeys; i++) {
          node_t *node = page_node(mp, i);
          if (node_flags(node) == N_BIGDATA) {
            /* Need writable leaf */
            if (mp != leaf) {
              mc->pg[mc->top] = leaf;
              page_copy(leaf, mp, ctx->env->ps);
              mp = leaf;
              node = page_node(mp, i);
            }
            /* re-point the node at the renumbered large-data chunk,
             * then emit that chunk */
            const pgr_t lp =
                page_get_large(mc, node_largedata_pgno(node), mp->txnid);
            if (unlikely((rc = lp.err) != MDBX_SUCCESS))
              goto bailout;
            const size_t datasize = node_ds(node);
            const pgno_t npages = largechunk_npages(ctx->env, datasize);
            poke_pgno(node_data(node), ctx->first_unallocated);
            rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0,
                                     npages);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
          } else if (node_flags(node) & N_SUBDATA) {
            if (!MDBX_DISABLE_VALIDATION &&
                unlikely(node_ds(node) != sizeof(tree_t))) {
              ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
                    "invalid dupsort sub-tree node size",
                    (unsigned)node_ds(node));
              rc = MDBX_CORRUPTED;
              goto bailout;
            }
            /* Need writable leaf */
            if (mp != leaf) {
              mc->pg[mc->top] = leaf;
              page_copy(leaf, mp, ctx->env->ps);
              mp = leaf;
              node = page_node(mp, i);
            }
            /* recurse: nested dupsort tree or a named sub-database */
            tree_t *nested = nullptr;
            if (node_flags(node) & N_DUPDATA) {
              rc = cursor_dupsort_setup(mc, node, mp);
              if (likely(rc == MDBX_SUCCESS)) {
                nested = &mc->subcur->nested_tree;
                rc = compacting_walk(ctx, &mc->subcur->cursor, &nested->root,
                                     mp->txnid);
              }
            } else {
              cASSERT(mc, (mc->flags & z_inner) == 0 && mc->subcur == 0);
              cursor_couple_t *couple =
                  container_of(mc, cursor_couple_t, outer);
              nested = &couple->inner.nested_tree;
              memcpy(nested, node_data(node), sizeof(tree_t));
              rc = compacting_walk_tree(ctx, nested);
            }
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
            /* write the updated sub-tree descriptor back into the node */
            memcpy(node_data(node), nested, sizeof(tree_t));
          }
        }
      }
    } else {
      mc->ki[mc->top]++;
      if (mc->ki[mc->top] < nkeys) {
        /* descend to the first leaf of the next sibling sub-tree */
        for (;;) {
          const node_t *node = page_node(mp, mc->ki[mc->top]);
          rc = page_get(mc, node_pgno(node), &mp, mp->txnid);
          if (unlikely(rc != MDBX_SUCCESS))
            goto bailout;
          mc->top += 1;
          if (unlikely(mc->top >= deep_limit)) {
            rc = MDBX_CURSOR_FULL;
            goto bailout;
          }
          mc->ki[mc->top] = 0;
          if (!is_branch(mp)) {
            mc->pg[mc->top] = mp;
            break;
          }
          /* Whenever we advance to a sibling branch page,
           * we must proceed all the way down to its first leaf. */
          page_copy(mc->pg[mc->top], mp, ctx->env->ps);
        }
        continue;
      }
    }
    /* emit the finished page and propagate its new pgno into the parent */
    const pgno_t pgno = ctx->first_unallocated;
    if (likely(!is_dupfix_leaf(mp))) {
      rc = compacting_put_page(ctx, mp, PAGEHDRSZ + mp->lower,
                               ctx->env->ps - (PAGEHDRSZ + mp->upper), 1);
    } else {
      rc = compacting_put_page(
          ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->dupfix_ksize, 0, 1);
    }
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
    if (mc->top) {
      /* Update parent if there is one */
      node_set_pgno(page_node(mc->pg[mc->top - 1], mc->ki[mc->top - 1]), pgno);
      cursor_pop(mc);
    } else {
      /* Otherwise we're done */
      *parent_pgno = pgno;
      break;
    }
  }
bailout:
  osal_free(buf);
  return rc;
}
/* Walks a whole tree (the main tree or a nested sub-db) with a throw-away
 * cursor couple; updates tree->root to its new page number on success. */
__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree) {
  if (unlikely(tree->root == P_INVALID))
    return MDBX_SUCCESS; /* empty db */

  cursor_couple_t couple;
  memset(&couple, 0, sizeof(couple));
  couple.inner.cursor.signature = ~cur_signature_live;
  kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}};
  int rc = cursor_init4walk(&couple, ctx->txn, tree, &kvx);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* NOTE(review): z_ignord | z_pagecheck adjust the page-validation mode for
   * this walk — confirm intended semantics against cursor.h definitions. */
  couple.outer.checking |= z_ignord | z_pagecheck;
  couple.inner.cursor.checking |= z_ignord | z_pagecheck;
  if (!tree->mod_txnid)
    tree->mod_txnid = ctx->txn->txnid;
  return compacting_walk(ctx, &couple.outer, &tree->root, tree->mod_txnid);
}
/* Finalizes the meta page of the compacted copy: shrinks/aligns the `now`
 * geometry to the actually used size, clamps it into [lower, upper], and
 * signs the meta as steady. */
__cold static void compacting_fixup_meta(MDBX_env *env, meta_t *meta) {
  eASSERT(env, meta->trees.gc.mod_txnid || meta->trees.gc.root == P_INVALID);
  eASSERT(env,
          meta->trees.main.mod_txnid || meta->trees.main.root == P_INVALID);

  /* Calculate filesize taking in account shrink/growing thresholds */
  if (meta->geometry.first_unallocated != meta->geometry.now) {
    meta->geometry.now = meta->geometry.first_unallocated;
    const size_t aligner =
        pv2pages(meta->geometry.grow_pv ? meta->geometry.grow_pv
                                        : meta->geometry.shrink_pv);
    if (aligner) {
      /* round `now` up to the grow/shrink step, then to the OS page size */
      const pgno_t aligned = pgno_align2os_pgno(
          env, meta->geometry.first_unallocated + aligner -
                   meta->geometry.first_unallocated % aligner);
      meta->geometry.now = aligned;
    }
  }
  /* clamp into the configured geometry bounds */
  if (meta->geometry.now < meta->geometry.lower)
    meta->geometry.now = meta->geometry.lower;
  if (meta->geometry.now > meta->geometry.upper)
    meta->geometry.now = meta->geometry.upper;

  /* Update signature */
  assert(meta->geometry.now >= meta->geometry.first_unallocated);
  meta_sign_as_steady(meta);
}
/* Turns the geometry into a resizable one: drops the lower bound to the
 * minimum and derives default grow/shrink steps when they are unset. */
__cold static void meta_make_sizeable(meta_t *meta) {
  meta->geometry.lower = MIN_PAGENO;
  if (meta->geometry.grow_pv == 0)
    meta->geometry.grow_pv =
        pages2pv((meta->geometry.upper - meta->geometry.lower) / 42 + 1);
  if (meta->geometry.shrink_pv == 0)
    meta->geometry.shrink_pv =
        pages2pv(pv2pages(meta->geometry.grow_pv) * 2);
}
/* Compacting copy: rebuilds the b-tree into `fd` without garbage/GC pages.
 * Spawns a dedicated writer thread; this (reader) thread walks the source
 * tree and fills the ping-pong buffers. For a pipe destination the meta
 * pages must be emitted up-front, since a pipe cannot seek back. */
__cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *read_txn,
                                       mdbx_filehandle_t fd, uint8_t *buffer,
                                       const bool dest_is_pipe,
                                       const MDBX_copy_flags_t flags) {
  const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
  uint8_t *const data_buffer =
      buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize);
  /* craft a fresh meta triplet for the destination */
  meta_t *const meta = meta_init_triplet(env, buffer);
  meta_set_txnid(env, meta, read_txn->txnid);

  if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
    meta_make_sizeable(meta);

  /* copy canary sequences if present */
  if (read_txn->canary.v) {
    meta->canary = read_txn->canary;
    meta->canary.v = constmeta_txnid(meta);
  }

  if (read_txn->dbs[MAIN_DBI].root == P_INVALID) {
    /* When the DB is empty, handle it specially to
     * fix any breakage like page leaks from ITS#8174. */
    meta->trees.main.flags = read_txn->dbs[MAIN_DBI].flags;
    compacting_fixup_meta(env, meta);
    if (dest_is_pipe) {
      int rc = osal_write(fd, buffer, meta_bytes);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    }
  } else {
    /* Count free pages + GC pages. */
    cursor_couple_t couple;
    int rc = cursor_init(&couple.outer, read_txn, FREE_DBI);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    pgno_t gc_npages = read_txn->dbs[FREE_DBI].branch_pages +
                       read_txn->dbs[FREE_DBI].leaf_pages +
                       read_txn->dbs[FREE_DBI].large_pages;
    MDBX_val key, data;
    rc = outer_first(&couple.outer, &key, &data);
    while (rc == MDBX_SUCCESS) {
      const pnl_t pnl = data.iov_base;
      /* validate each GC record before trusting its page count */
      if (unlikely(data.iov_len % sizeof(pgno_t) ||
                   data.iov_len < MDBX_PNL_SIZEOF(pnl))) {
        ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid GC-record length", data.iov_len);
        return MDBX_CORRUPTED;
      }
      if (unlikely(!pnl_check(pnl, read_txn->geo.first_unallocated))) {
        ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid GC-record content");
        return MDBX_CORRUPTED;
      }
      gc_npages += MDBX_PNL_GETSIZE(pnl);
      rc = outer_next(&couple.outer, &key, &data, MDBX_NEXT);
    }
    if (unlikely(rc != MDBX_NOTFOUND))
      return rc;

    /* the compacted copy occupies exactly used-minus-garbage pages */
    meta->geometry.first_unallocated =
        read_txn->geo.first_unallocated - gc_npages;
    meta->trees.main = read_txn->dbs[MAIN_DBI];

    ctx_t ctx;
    memset(&ctx, 0, sizeof(ctx));
    rc = osal_condpair_init(&ctx.condpair);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;

    memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF);
    ctx.write_buf[0] = data_buffer;
    ctx.write_buf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF;
    ctx.first_unallocated = NUM_METAS;
    ctx.env = env;
    ctx.fd = fd;
    ctx.txn = read_txn;

    osal_thread_t thread;
    int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx);
    if (likely(thread_err == MDBX_SUCCESS)) {
      if (dest_is_pipe) {
        /* a pipe cannot seek back, so the meta must be written first */
        if (!meta->trees.main.mod_txnid)
          meta->trees.main.mod_txnid = read_txn->txnid;
        compacting_fixup_meta(env, meta);
        rc = osal_write(fd, buffer, meta_bytes);
      }
      if (likely(rc == MDBX_SUCCESS))
        rc = compacting_walk_tree(&ctx, &meta->trees.main);
      if (ctx.write_len[ctx.head & 1])
        /* toggle to flush non-empty buffers */
        compacting_toggle_write_buffers(&ctx);

      /* cross-check the walker's page count against the GC-derived one */
      if (likely(rc == MDBX_SUCCESS) &&
          unlikely(meta->geometry.first_unallocated != ctx.first_unallocated)) {
        if (ctx.first_unallocated > meta->geometry.first_unallocated) {
          ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO
                " %c expected %" PRIaPGNO,
                "has double-used pages or other corruption",
                ctx.first_unallocated, '>', meta->geometry.first_unallocated);
          rc = MDBX_CORRUPTED; /* corrupted DB */
        }
        if (ctx.first_unallocated < meta->geometry.first_unallocated) {
          WARNING(
              "the source DB %s: post-compactification used pages %" PRIaPGNO
              " %c expected %" PRIaPGNO,
              "has page leak(s)", ctx.first_unallocated, '<',
              meta->geometry.first_unallocated);
          if (dest_is_pipe)
            /* the root within already written meta-pages is wrong */
            rc = MDBX_CORRUPTED;
        }
        /* fixup meta */
        meta->geometry.first_unallocated = ctx.first_unallocated;
      }

      /* toggle with empty buffers to exit thread's loop */
      eASSERT(env, (ctx.write_len[ctx.head & 1]) == 0);
      compacting_toggle_write_buffers(&ctx);
      thread_err = osal_thread_join(thread);
      eASSERT(env, (ctx.tail == ctx.head && ctx.write_len[ctx.head & 1] == 0) ||
                       ctx.error);
      osal_condpair_destroy(&ctx.condpair);
    }
    if (unlikely(thread_err != MDBX_SUCCESS))
      return thread_err;
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    if (unlikely(ctx.error != MDBX_SUCCESS))
      return ctx.error;
    if (!dest_is_pipe)
      compacting_fixup_meta(env, meta);
  }

  /* Extend file if required */
  if (meta->geometry.now != meta->geometry.first_unallocated) {
    const size_t whole_size = pgno2bytes(env, meta->geometry.now);
    if (!dest_is_pipe)
      return osal_ftruncate(fd, whole_size);

    /* a pipe cannot be truncated: pad the tail with zero chunks instead */
    const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated);
    memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
    for (size_t offset = used_size; offset < whole_size;) {
      const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
                               ? (size_t)MDBX_ENVCOPY_WRITEBUF
                               : whole_size - offset;
      int rc = osal_write(fd, data_buffer, chunk);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      offset += chunk;
    }
  }
  return MDBX_SUCCESS;
}
/* As-is copy: snapshots the meta pages under the writers lock, then streams
 * the data through sendfile/copy_file_range when available, falling back to
 * buffered memcpy+write. */
__cold static int copy_asis(MDBX_env *env, MDBX_txn *read_txn,
                            mdbx_filehandle_t fd, uint8_t *buffer,
                            const bool dest_is_pipe,
                            const MDBX_copy_flags_t flags) {
  int rc = txn_end(read_txn, TXN_END_RESET_TMP);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Temporarily block writers until we snapshot the meta pages */
  rc = lck_txn_lock(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  rc = txn_renew(read_txn, MDBX_TXN_RDONLY);
  if (unlikely(rc != MDBX_SUCCESS)) {
    lck_txn_unlock(env);
    return rc;
  }

  jitter4testing(false);
  const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
  const troika_t troika = meta_tap(env);
  /* Make a snapshot of meta-pages,
   * but writing ones after the data was flushed */
  memcpy(buffer, env->dxb_mmap.base, meta_bytes);
  meta_t *const headcopy = /* LY: get pointer to the snapshot copy */
      ptr_disp(buffer,
               ptr_dist(meta_recent(env, &troika).ptr_c, env->dxb_mmap.base));
  lck_txn_unlock(env);

  if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
    meta_make_sizeable(headcopy);
  /* Update signature to steady */
  meta_sign_as_steady(headcopy);

  /* Copy the data */
  const size_t whole_size = pgno_align2os_bytes(env, read_txn->geo.end_pgno);
  const size_t used_size = pgno2bytes(env, read_txn->geo.first_unallocated);
  jitter4testing(false);

  if (dest_is_pipe)
    rc = osal_write(fd, buffer, meta_bytes);

  uint8_t *const data_buffer =
      buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize);
#if MDBX_USE_COPYFILERANGE
  static bool copyfilerange_unavailable;
  bool not_the_same_filesystem = false;
  struct statfs statfs_info;
  if (fstatfs(fd, &statfs_info) ||
      statfs_info.f_type == /* ECRYPTFS_SUPER_MAGIC */ 0xf15f)
    /* avoid use copyfilerange_unavailable() to ecryptfs due bugs */
    not_the_same_filesystem = true;
#endif /* MDBX_USE_COPYFILERANGE */

  for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) {
#if MDBX_USE_SENDFILE
    /* fast path for a pipe destination: zero-copy kernel transfer */
    static bool sendfile_unavailable;
    if (dest_is_pipe && likely(!sendfile_unavailable)) {
      off_t in_offset = offset;
      const ssize_t written =
          sendfile(fd, env->lazy_fd, &in_offset, used_size - offset);
      if (likely(written > 0)) {
        offset = in_offset;
        continue;
      }
      rc = MDBX_ENODATA;
      if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE)
        break;
      sendfile_unavailable = true; /* don't retry on this process */
    }
#endif /* MDBX_USE_SENDFILE */

#if MDBX_USE_COPYFILERANGE
    /* fast path for a regular-file destination on the same filesystem */
    if (!dest_is_pipe && !not_the_same_filesystem &&
        likely(!copyfilerange_unavailable)) {
      off_t in_offset = offset, out_offset = offset;
      ssize_t bytes_copied = copy_file_range(
          env->lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0);
      if (likely(bytes_copied > 0)) {
        offset = in_offset;
        continue;
      }
      rc = MDBX_ENODATA;
      if (bytes_copied == 0)
        break;
      rc = errno;
      if (rc == EXDEV || rc == /* workaround for ecryptfs bug(s),
                                  maybe useful for others FS */
                             EINVAL)
        not_the_same_filesystem = true;
      else if (ignore_enosys(rc) == MDBX_RESULT_TRUE)
        copyfilerange_unavailable = true;
      else
        break;
    }
#endif /* MDBX_USE_COPYFILERANGE */

    /* fallback to portable */
    const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset)
                             ? (size_t)MDBX_ENVCOPY_WRITEBUF
                             : used_size - offset;
    /* copy to avoid EFAULT in case swapped-out */
    memcpy(data_buffer, ptr_disp(env->dxb_mmap.base, offset), chunk);
    rc = osal_write(fd, data_buffer, chunk);
    offset += chunk;
  }

  /* Extend file if required */
  if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
    if (!dest_is_pipe)
      rc = osal_ftruncate(fd, whole_size);
    else {
      /* pipes cannot be truncated/extended: write zero padding instead */
      memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
      for (size_t offset = used_size;
           rc == MDBX_SUCCESS && offset < whole_size;) {
        const size_t chunk =
            ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
                ? (size_t)MDBX_ENVCOPY_WRITEBUF
                : whole_size - offset;
        rc = osal_write(fd, data_buffer, chunk);
        offset += chunk;
      }
    }
  }
  return rc;
}
/* Copies the environment into an already-open destination handle `fd`,
 * either as-is or compacting, depending on MDBX_CP_COMPACT. For seekable
 * destinations a stub meta is written first and replaced only after a
 * successful copy, so an interrupted copy is never a valid database. */
__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
                            MDBX_copy_flags_t flags) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const int dest_is_pipe = osal_is_pipe(fd);
  if (MDBX_IS_ERROR(dest_is_pipe))
    return dest_is_pipe;

  if (!dest_is_pipe) {
    rc = osal_fseek(fd, 0);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  /* compact mode needs two ping-pong write buffers, as-is needs one */
  const size_t buffer_size =
      pgno_align2os_bytes(env, NUM_METAS) +
      ceil_powerof2(((flags & MDBX_CP_COMPACT)
                         ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF
                         : (size_t)MDBX_ENVCOPY_WRITEBUF),
                    globals.sys_pagesize);

  uint8_t *buffer = nullptr;
  rc = osal_memalign_alloc(globals.sys_pagesize, buffer_size, (void **)&buffer);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_txn *read_txn = nullptr;
  /* Do the lock/unlock of the reader mutex before starting the
   * write txn. Otherwise other read txns could block writers. */
  rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &read_txn);
  if (unlikely(rc != MDBX_SUCCESS)) {
    osal_memalign_free(buffer);
    return rc;
  }

  if (!dest_is_pipe) {
    /* Firstly write a stub to meta-pages.
     * Now we sure to incomplete copy will not be used. */
    memset(buffer, -1, pgno2bytes(env, NUM_METAS));
    rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS));
  }

  if (likely(rc == MDBX_SUCCESS)) {
    memset(buffer, 0, pgno2bytes(env, NUM_METAS));
    rc = ((flags & MDBX_CP_COMPACT) ? copy_with_compacting : copy_asis)(
        env, read_txn, fd, buffer, dest_is_pipe, flags);
  }
  mdbx_txn_abort(read_txn);

  if (!dest_is_pipe) {
    /* flush the data, then replace the stub with the real meta pages */
    if (likely(rc == MDBX_SUCCESS))
      rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
    /* Write actual meta */
    if (likely(rc == MDBX_SUCCESS))
      rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);
    if (likely(rc == MDBX_SUCCESS))
      rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  }

  osal_memalign_free(buffer);
  return rc;
}
/* Copies the environment to a new file at `dest_path`.
 * NOTE(preprocessor): on Windows mdbx_env_copy() converts the multibyte path
 * and forwards to mdbx_env_copyW(); elsewhere the shared body below belongs
 * directly to mdbx_env_copy(). */
__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path,
                         MDBX_copy_flags_t flags) {
#if defined(_WIN32) || defined(_WIN64)
  wchar_t *dest_pathW = nullptr;
  int rc = osal_mb2w(dest_path, &dest_pathW);
  if (likely(rc == MDBX_SUCCESS)) {
    rc = mdbx_env_copyW(env, dest_pathW, flags);
    osal_free(dest_pathW);
  }
  return rc;
}

__cold int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path,
                          MDBX_copy_flags_t flags) {
#endif /* Windows */

  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!dest_path))
    return MDBX_EINVAL;

  /* The destination path must exist, but the destination file must not.
   * We don't want the OS to cache the writes, since the source data is
   * already in the OS cache. */
  mdbx_filehandle_t newfd;
  rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd,
#if defined(_WIN32) || defined(_WIN64)
                     (mdbx_mode_t)-1
#else
                     S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP
#endif
  );

#if defined(_WIN32) || defined(_WIN64)
  /* no locking required since the file opened with ShareMode == 0 */
#else
  if (rc == MDBX_SUCCESS) {
    /* take an exclusive write-lock on the destination while copying */
    MDBX_STRUCT_FLOCK lock_op;
    memset(&lock_op, 0, sizeof(lock_op));
    lock_op.l_type = F_WRLCK;
    lock_op.l_whence = SEEK_SET;
    lock_op.l_start = 0;
    lock_op.l_len = OFF_T_MAX;
    if (MDBX_FCNTL(newfd, MDBX_F_SETLK, &lock_op)
#if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) &&      \
    (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24)
        || flock(newfd, LOCK_EX | LOCK_NB)
#endif /* Linux */
    )
      rc = errno;
  }
#endif /* Windows / POSIX */

  if (rc == MDBX_SUCCESS)
    rc = mdbx_env_copy2fd(env, newfd, flags);

  if (newfd != INVALID_HANDLE_VALUE) {
    int err = osal_closefile(newfd);
    if (rc == MDBX_SUCCESS && err != rc)
      rc = err;
    /* remove the half-written destination on any failure */
    if (rc != MDBX_SUCCESS)
      (void)osal_removefile(dest_path);
  }
  return rc;
}

29506
src/core.c

File diff suppressed because it is too large Load Diff

2451
src/cursor.c Normal file

File diff suppressed because it is too large Load Diff

398
src/cursor.h Normal file
View File

@ -0,0 +1,398 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* Состояние курсора.
*
* плохой/poor:
 * - неустановленный курсор с незаполненным стеком;
* - следует пропускать во всех циклах отслеживания/корректировки
* позиций курсоров;
* - допускаются только операции предполагающие установку абсолютной позиции;
* - в остальных случаях возвращается ENODATA.
*
* У таких курсоров top = -1 и flags < 0, что позволяет дешево проверять и
* пропускать такие курсоры в циклах отслеживания/корректировки по условию
* probe_cursor->top < this_cursor->top.
*
* пустой/hollow:
* - частично инициализированный курсор, но без доступной пользователю позиции,
* поэтому нельзя выполнить какую-либо операцию без абсолютного (не
* относительного) позиционирования;
* - ki[top] может быть некорректным, в том числе >= page_numkeys(pg[top]).
*
* У таких курсоров top >= 0, но flags < 0 (есть флажок z_hollow).
*
* установленный/pointed:
* - полностью инициализированный курсор с конкретной позицией с данными;
* - можно прочитать текущую строку, удалить её, либо выполнить
* относительное перемещение;
* - может иметь флажки z_after_delete, z_eof_hard и z_eof_soft;
* - наличие z_eof_soft означает что курсор перемещен за пределы данных,
 * поэтому нельзя прочитать текущие данные, либо удалить их.
*
* У таких курсоров top >= 0 и flags >= 0 (нет флажка z_hollow).
*
* наполненный данными/filled:
* - это установленный/pointed курсор без флагов z_eof_soft;
 * - за курсором есть данные, возможны CRUD операции в текущей позиции.
*
* У таких курсоров top >= 0 и (unsigned)flags < z_eof_soft.
*
* Изменения состояния.
*
* - Сбрасывается состояние курсора посредством top_and_flags |= z_poor_mark,
* что равносильно top = -1 вместе с flags |= z_poor_mark;
* - При позиционировании курсора сначала устанавливается top, а flags
 * только в самом конце при отсутствии ошибок.
* - Повторное позиционирование first/last может начинаться
* с установки/обнуления только top без сброса flags, что позволяет работать
* быстрому пути внутри tree_search_finalize().
*
* - Заморочки с концом данных:
* - mdbx_cursor_get(NEXT) выполняет две операции (перемещение и чтение),
 * поэтому перемещение на последнюю строку всегда успешно,
* а ошибка возвращается только при последующем next().
* Однако, из-за этой двойственности семантика ситуации возврата ошибки
* из mdbx_cursor_get(NEXT) допускает разночтение/неопределенность, ибо
* не понятно к чему относится ошибка:
* - Если к чтению данных, то курсор перемещен и стоит после последней
* строки. Соответственно, чтение в текущей позиции запрещено,
* а при выполнении prev() курсор вернется на последнюю строку;
* - Если же ошибка относится к перемещению, то курсор не перемещен и
* остается на последней строке. Соответственно, чтение в текущей
* позиции допустимо, а при выполнении prev() курсор встанет
* на пред-последнюю строку.
* - Пикантность в том, что пользователи (так или иначе) полагаются
* на оба варианта поведения, при этом конечно ожидают что после
* ошибки MDBX_NEXT функция mdbx_cursor_eof() будет возвращать true.
* - далее добавляется схожая ситуация с MDBX_GET_RANGE, MDBX_LOWERBOUND,
* MDBX_GET_BOTH_RANGE и MDBX_UPPERBOUND. Тут при неуспехе поиска курсор
* может/должен стоять после последней строки.
* - далее добавляется MDBX_LAST. Тут курсор должен стоять на последней
 * строке и допускать чтение в текущей позиции,
* но mdbx_cursor_eof() должен возвращать true.
*
* Решение = делаем два флажка z_eof_soft и z_eof_hard:
* - Когда установлен только z_eof_soft,
* функция mdbx_cursor_eof() возвращает true, но допускается
* чтение данных в текущей позиции, а prev() передвигает курсор
* на пред-последнюю строку.
* - Когда установлен z_eof_hard, чтение данных в текущей позиции
* не допускается, и mdbx_cursor_eof() также возвращает true,
 * а prev() устанавливает курсор на последнюю строку. */
/* Cursor state flags, combined with the stack depth in top_and_flags. */
enum cursor_state {
  /* This is the nested (inner) cursor for a nested tree/page, i.e. the
     inner member of struct cursor_couple. */
  z_inner = 0x01,
  /* Preparation for a GC update is in progress, so pages may be taken
     from the GC even for FREE_DBI. */
  z_gcu_preparation = 0x02,
  /* The cursor has just been created, so auto-positioning to the
     first/last entry is allowed instead of returning an error. */
  z_fresh = 0x04,
  /* The previous operation was a deletion, therefore the cursor already
     physically points at the next element and the corresponding move
     operation must be ignored. */
  z_after_delete = 0x08,
  /* Disables the fast path inside tree_search_finalize(). */
  z_disable_tree_search_fastpath = 0x10,
  /* The cursor is logically at the end of data, but physically on the last
   * row; ki[top] == page_numkeys(pg[top]) - 1 and the current position is
   * still readable. */
  z_eof_soft = 0x20,
  /* The cursor is logically past the end of data, so the next "backward"
     move must be ignored and/or position the cursor at the last row.
     In this state no CRUD operation at the current position is possible. */
  z_eof_hard = 0x40,
  /* No data under the cursor, its position is logically undefined:
     no CRUD operations at the current position, relative moves
     are forbidden. */
  z_hollow = -128 /* 0x80 */,
  /* Masks for clearing/setting the state. */
  z_clear_mask = z_inner | z_gcu_preparation,
  z_poor_mark = z_eof_hard | z_hollow | z_disable_tree_search_fastpath,
  z_fresh_mark = z_poor_mark | z_fresh
};
/* True when the cursor is the nested (inner) cursor of a cursor_couple. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_inner(const MDBX_cursor *mc) {
  return 0 != (mc->flags & z_inner);
}
/* A "poor" cursor: unpositioned, with an empty page stack (top < 0). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_poor(const MDBX_cursor *mc) {
  const bool poor = mc->top < 0;
  cASSERT(mc, poor == (mc->top_and_flags < 0));
  if (poor && mc->subcur)
    cASSERT(mc, mc->subcur->cursor.flags < 0 && mc->subcur->cursor.top < 0);
  return poor;
}
/* A "pointed" cursor: positioned, i.e. top >= 0 (the opposite of poor). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_pointed(const MDBX_cursor *mc) {
  const bool pointed = mc->top >= 0;
  cASSERT(mc, pointed == (mc->top_and_flags >= 0));
  if (!pointed && mc->subcur)
    cASSERT(mc, is_poor(&mc->subcur->cursor));
  return pointed;
}
/* A "hollow" cursor: no user-readable data at the current position
 * (flags < 0, i.e. the z_hollow bit is set). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_hollow(const MDBX_cursor *mc) {
  const bool hollow = mc->flags < 0;
  if (!hollow) {
    cASSERT(mc, mc->top >= 0);
    cASSERT(mc, (mc->flags & z_eof_hard) ||
                    mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
  } else if (mc->subcur)
    cASSERT(mc, is_poor(&mc->subcur->cursor));
  return hollow;
}
/* True when any of z_eof_soft/z_eof_hard/z_hollow is set; the unsigned
 * cast maps the negative z_hollow (0x80) above z_eof_soft as well. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_eof(const MDBX_cursor *mc) {
  return (uint8_t)mc->flags >= z_eof_soft;
}
/* "Filled": neither z_eof_hard nor z_hollow is set, so CRUD operations at
 * the current position are possible (unsigned cast maps z_hollow to 0x80). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_filled(const MDBX_cursor *mc) {
  return (uint8_t)mc->flags < z_eof_hard;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
inner_filled(const MDBX_cursor *mc) {
return mc->subcur && is_filled(&mc->subcur->cursor);
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
inner_pointed(const MDBX_cursor *mc) {
return mc->subcur && is_pointed(&mc->subcur->cursor);
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
inner_hollow(const MDBX_cursor *mc) {
return !mc->subcur || is_hollow(&mc->subcur->cursor);
}
/* Invalidate the nested (dupsort) cursor, if any: drop its nested-tree
 * root and mark it as an inner "poor" cursor. */
MDBX_MAYBE_UNUSED static inline void inner_gone(MDBX_cursor *mc) {
  if (!mc->subcur)
    return;
  TRACE("reset inner cursor %p",
        __Wpedantic_format_voidptr(&mc->subcur->cursor));
  mc->subcur->nested_tree.root = 0;
  mc->subcur->cursor.top_and_flags = z_inner | z_poor_mark;
}
/* Demote the cursor to the "poor" state; an inner cursor additionally
 * drops its tree root, an outer one also invalidates its nested cursor. */
MDBX_MAYBE_UNUSED static inline void be_poor(MDBX_cursor *mc) {
  const bool was_inner = is_inner(mc);
  if (!was_inner) {
    mc->top_and_flags |= z_poor_mark;
    inner_gone(mc);
  } else {
    mc->tree->root = 0;
    mc->top_and_flags = z_inner | z_poor_mark;
  }
  cASSERT(mc, is_poor(mc) && !is_pointed(mc) && !is_filled(mc));
  cASSERT(mc, was_inner == is_inner(mc));
}
/* Mark a positioned cursor as filled: clear everything except the
 * z_inner/z_gcu_preparation bits (z_clear_mask). */
MDBX_MAYBE_UNUSED static inline void be_filled(MDBX_cursor *mc) {
  cASSERT(mc, mc->top >= 0);
  cASSERT(mc, mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
  const bool was_inner = is_inner(mc);
  mc->flags &= z_clear_mask;
  cASSERT(mc, is_filled(mc));
  cASSERT(mc, was_inner == is_inner(mc));
}
/* True when `scan` is a distinct cursor whose stack is at least as deep
 * as `base`'s (used by the cursor tracking/adjustment loops). */
MDBX_MAYBE_UNUSED static inline bool is_related(const MDBX_cursor *base,
                                                const MDBX_cursor *scan) {
  cASSERT(base, base->top >= 0);
  return scan != base && scan->top >= base->top;
}
/* Cursor checking/validation flags. */
enum cursor_checking {
  z_branch = 0x01 /* same as P_BRANCH for check_leaf_type() */,
  z_leaf = 0x02 /* same as P_LEAF for check_leaf_type() */,
  z_largepage = 0x04 /* same as P_LARGE for check_leaf_type() */,
  z_updating = 0x08 /* update/rebalance pending */,
  z_ignord = 0x10 /* don't check keys ordering */,
  z_dupfix = 0x20 /* same as P_DUPFIX for check_leaf_type() */,
  z_retiring = 0x40 /* refs to child pages may be invalid */,
  z_pagecheck = 0x80 /* perform page checking, see MDBX_VALIDATION */
};
MDBX_INTERNAL int __must_check_result cursor_check(const MDBX_cursor *mc);
/* Recover the DBI handle from the cursor's dbi_state pointer, which aliases
 * an element of txn->dbi_state[]. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline size_t
cursor_dbi(const MDBX_cursor *mc) {
  cASSERT(mc, mc->txn && mc->txn->signature == txn_signature);
  const size_t handle = mc->dbi_state - mc->txn->dbi_state;
  cASSERT(mc, handle < mc->txn->env->n_dbi);
  return handle;
}
/* Whether the cursor's DBI handle has changed within its transaction. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
cursor_dbi_changed(const MDBX_cursor *mc) {
  const size_t handle = cursor_dbi(mc);
  return dbi_changed(mc->txn, handle);
}
/* Accessor for the cursor's per-txn DBI state byte. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t *
cursor_dbi_state(const MDBX_cursor *mc) {
  return mc->dbi_state;
}
/* True when the cursor operates on the GC (FREE_DBI) table. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
cursor_is_gc(const MDBX_cursor *mc) {
  return mc->txn->dbi_state + FREE_DBI == mc->dbi_state;
}
/* True when the cursor operates on the MAIN_DBI table. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
cursor_is_main(const MDBX_cursor *mc) {
  return mc->txn->dbi_state + MAIN_DBI == mc->dbi_state;
}
/* True when the cursor operates on one of the core tables (dbi < CORE_DBS). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
cursor_is_core(const MDBX_cursor *mc) {
  return mc->txn->dbi_state + CORE_DBS > mc->dbi_state;
}
/* DBI value for debug/trace output: negated for nested (inner) cursors. */
MDBX_MAYBE_UNUSED static inline int cursor_dbi_dbg(const MDBX_cursor *mc) {
  const int handle = (int)cursor_dbi(mc);
  return is_inner(mc) ? -handle : handle;
}
/* Push page `mp` with key-index `ki` onto the cursor's page stack.
 * On overflow the cursor is demoted to poor, the txn is flagged with
 * MDBX_TXN_ERROR and MDBX_CURSOR_FULL is returned. */
MDBX_MAYBE_UNUSED static inline int __must_check_result
cursor_push(MDBX_cursor *mc, page_t *mp, indx_t ki) {
  TRACE("pushing page %" PRIaPGNO " on db %d cursor %p", mp->pgno,
        cursor_dbi_dbg(mc), __Wpedantic_format_voidptr(mc));
  if (likely(mc->top < CURSOR_STACK_SIZE - 1)) {
    mc->top += 1;
    mc->pg[mc->top] = mp;
    mc->ki[mc->top] = ki;
    return MDBX_SUCCESS;
  }
  be_poor(mc);
  mc->txn->flags |= MDBX_TXN_ERROR;
  return MDBX_CURSOR_FULL;
}
/* Pop the topmost page off the cursor's page stack. */
MDBX_MAYBE_UNUSED static inline void cursor_pop(MDBX_cursor *mc) {
  TRACE("popped page %" PRIaPGNO " off db %d cursor %p", mc->pg[mc->top]->pgno,
        cursor_dbi_dbg(mc), __Wpedantic_format_voidptr(mc));
  cASSERT(mc, mc->top >= 0);
  mc->top--;
}
/* Check that the page's type bits match what the cursor expects
 * (branch/leaf/large/dupfix, per mc->checking). */
MDBX_NOTHROW_PURE_FUNCTION static inline bool
check_leaf_type(const MDBX_cursor *mc, const page_t *mp) {
  return 0 == ((page_type(mp) ^ mc->checking) &
               (z_branch | z_leaf | z_largepage | z_dupfix));
}
/*--- cursor internals implemented in cursor.c -----------------------------*/
MDBX_INTERNAL void cursor_eot(MDBX_cursor *mc, const bool merge);
/* Back up (shadow) the parent's cursors of `dbi` for a nested write txn. */
MDBX_INTERNAL int cursor_shadow(MDBX_cursor *parent_cursor,
                                MDBX_txn *nested_txn, const size_t dbi);
MDBX_INTERNAL MDBX_cursor *cursor_cpstk(const MDBX_cursor *csrc,
                                        MDBX_cursor *cdst);
/* Dispatcher for the MDBX_cursor_op get/move operations. */
MDBX_INTERNAL int __must_check_result cursor_ops(MDBX_cursor *mc, MDBX_val *key,
                                                 MDBX_val *data,
                                                 const MDBX_cursor_op op);
/* NOTE(review): presumably the _checklen variant additionally validates
 * key/data lengths before delegating — confirm in cursor.c. */
MDBX_INTERNAL int __must_check_result cursor_put_checklen(MDBX_cursor *mc,
                                                          const MDBX_val *key,
                                                          MDBX_val *data,
                                                          unsigned flags);
MDBX_INTERNAL int __must_check_result cursor_put(MDBX_cursor *mc,
                                                 const MDBX_val *key,
                                                 MDBX_val *data,
                                                 unsigned flags);
MDBX_INTERNAL int __must_check_result cursor_check_updating(MDBX_cursor *mc);
MDBX_INTERNAL int __must_check_result cursor_del(MDBX_cursor *mc,
                                                 unsigned flags);
/* Move to the adjacent sibling page on the same tree level. */
MDBX_INTERNAL int __must_check_result cursor_sibling_left(MDBX_cursor *mc);
MDBX_INTERNAL int __must_check_result cursor_sibling_right(MDBX_cursor *mc);
/* Result of a cursor search: error code plus exact-match indicator. */
typedef struct cursor_set_result {
  int err;
  bool exact;
} csr_t;
MDBX_INTERNAL csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                                MDBX_cursor_op op);
/* inner_* operate on the nested (dupsort) cursor, outer_* on the main one. */
MDBX_INTERNAL int __must_check_result inner_first(MDBX_cursor *__restrict mc,
                                                  MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result inner_last(MDBX_cursor *__restrict mc,
                                                 MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result outer_first(MDBX_cursor *__restrict mc,
                                                  MDBX_val *__restrict key,
                                                  MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result outer_last(MDBX_cursor *__restrict mc,
                                                 MDBX_val *__restrict key,
                                                 MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result inner_next(MDBX_cursor *__restrict mc,
                                                 MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result inner_prev(MDBX_cursor *__restrict mc,
                                                 MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result outer_next(MDBX_cursor *__restrict mc,
                                                 MDBX_val *__restrict key,
                                                 MDBX_val *__restrict data,
                                                 MDBX_cursor_op op);
MDBX_INTERNAL int __must_check_result outer_prev(MDBX_cursor *__restrict mc,
                                                 MDBX_val *__restrict key,
                                                 MDBX_val *__restrict data,
                                                 MDBX_cursor_op op);
MDBX_INTERNAL int cursor_init4walk(cursor_couple_t *couple,
                                   const MDBX_txn *const txn,
                                   tree_t *const tree, kvx_t *const kvx);
MDBX_INTERNAL int __must_check_result cursor_init(MDBX_cursor *mc,
                                                  const MDBX_txn *txn,
                                                  size_t dbi);
/* Set up the nested cursor for a dupsort node. */
MDBX_INTERNAL int __must_check_result cursor_dupsort_setup(MDBX_cursor *mc,
                                                           const node_t *node,
                                                           const page_t *mp);
MDBX_INTERNAL int __must_check_result cursor_touch(MDBX_cursor *const mc,
                                                   const MDBX_val *key,
                                                   const MDBX_val *data);
/*----------------------------------------------------------------------------*/
/* Update sub-page pointer, if any, in mc->subcur.
 * Needed when the node which contains the sub-page may have moved.
 * Called with mp = mc->pg[mc->top], ki = mc->ki[mc->top]. */
MDBX_MAYBE_UNUSED static inline void
cursor_inner_refresh(const MDBX_cursor *mc, const page_t *mp, unsigned ki) {
  cASSERT(mc, is_leaf(mp));
  const node_t *node = page_node(mp, ki);
  /* only a plain dup sub-page (N_DUPDATA without N_SUBDATA) is embedded in
   * the node and thus needs re-pointing */
  if ((node_flags(node) & (N_DUPDATA | N_SUBDATA)) != N_DUPDATA)
    return;
  mc->subcur->cursor.pg[0] = node_data(node);
}
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool cursor_is_tracked(const MDBX_cursor *mc);

954
src/dbi.c Normal file
View File

@ -0,0 +1,954 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Portable count-trailing-zeros for the dbi_sparse bitmap, used when no
 * compiler ctz builtin is available: isolate the lowest set bit, then look
 * its position up via a de Bruijn multiply-and-shift. */
size_t dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi) {
  tASSERT(txn, bmi > 0);
  bmi &= -bmi; /* keep only the lowest set bit */
  if (sizeof(txn->dbi_sparse[0]) > 4) {
    static const uint8_t debruijn_ctz64[64] = {
        0,  1,  2,  53, 3,  7,  54, 27, 4,  38, 41, 8,  34, 55, 48, 28,
        62, 5,  39, 46, 44, 42, 22, 9,  24, 35, 59, 56, 49, 18, 29, 11,
        63, 52, 6,  26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12};
    return debruijn_ctz64[(UINT64_C(0x022FDD63CC95386D) * (uint64_t)bmi) >> 58];
  }
  static const uint8_t debruijn_ctz32[32] = {
      0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
  return debruijn_ctz32[(UINT32_C(0x077CB531) * (uint32_t)bmi) >> 27];
}
/* Atomically snapshot the (flags, sequence) pair of an env-level DBI slot.
 * Seqlock-style read: the sequence is re-read after the flags and the pair
 * is retried until the sequence is stable, so the result is consistent
 * even against concurrent updaters. */
struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) {
  eASSERT(env, dbi < env->n_dbi);
  struct dbi_snap_result r;
  uint32_t seq = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
  for (;;) {
    r.sequence = seq;
    r.flags = env->dbs_flags[dbi];
    seq = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
    if (likely(seq == r.sequence))
      return r;
  }
}
/* Import an environment-level DBI handle into the transaction: initializes
 * the txn-local slot (cursors/dbi_state/dbi_seqs/dbs), growing the per-txn
 * bitmap/arrays as needed, and for nested transactions inherits the state
 * and shadows the cursors from the parent.
 * Returns MDBX_SUCCESS, MDBX_BAD_DBI or MDBX_DANGLING_DBI. */
__noinline int dbi_import(MDBX_txn *txn, const size_t dbi) {
  const MDBX_env *const env = txn->env;
  if (dbi >= env->n_dbi || !env->dbs_flags[dbi])
    return MDBX_BAD_DBI;
#if MDBX_ENABLE_DBI_SPARSE
  const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
  const size_t bitmap_indx = dbi / bitmap_chunk;
  const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk;
  if (dbi >= txn->n_dbi) {
    /* grow the sparse bitmap and n_dbi in this txn and all its parents */
    for (size_t i = (txn->n_dbi + bitmap_chunk - 1) / bitmap_chunk;
         bitmap_indx >= i; ++i)
      txn->dbi_sparse[i] = 0;
    eASSERT(env, (txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0);
    MDBX_txn *scan = txn;
    do {
      eASSERT(env, scan->dbi_sparse == txn->dbi_sparse);
      eASSERT(env, scan->n_dbi < dbi + 1);
      scan->n_dbi = (unsigned)dbi + 1;
      scan->dbi_state[dbi] = 0;
      scan = scan->parent;
    } while (scan /* && scan->dbi_sparse == txn->dbi_sparse */);
    txn->dbi_sparse[bitmap_indx] |= bitmap_mask;
    goto lindo;
  }
  if ((txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0) {
    /* slot exists but its bitmap bit is clear: (re)claim it here and
     * in all parent transactions */
    MDBX_txn *scan = txn;
    do {
      eASSERT(env, scan->dbi_sparse == txn->dbi_sparse);
      eASSERT(env, scan->n_dbi == txn->n_dbi);
      scan->dbi_state[dbi] = 0;
      scan = scan->parent;
    } while (scan /* && scan->dbi_sparse == txn->dbi_sparse */);
    txn->dbi_sparse[bitmap_indx] |= bitmap_mask;
    goto lindo;
  }
#else
  if (dbi >= txn->n_dbi) {
    size_t i = txn->n_dbi;
    do
      txn->dbi_state[i] = 0;
    while (dbi >= ++i);
    txn->n_dbi = i;
    goto lindo;
  }
#endif /* MDBX_ENABLE_DBI_SPARSE */
  if (!txn->dbi_state[dbi]) {
  lindo:
    /* the dbi slot is not yet initialized in the transaction,
     * and the handle has not been used */
    txn->cursors[dbi] = nullptr;
    MDBX_txn *const parent = txn->parent;
    if (parent) {
      /* nested write transaction */
      int rc = dbi_check(parent, dbi);
      /* copy the subDB state, clearing the "new" flags. */
      eASSERT(env, txn->dbi_seqs == parent->dbi_seqs);
      txn->dbi_state[dbi] =
          parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
      if (likely(rc == MDBX_SUCCESS)) {
        txn->dbs[dbi] = parent->dbs[dbi];
        if (parent->cursors[dbi]) {
          rc = cursor_shadow(parent->cursors[dbi], txn, dbi);
          if (unlikely(rc != MDBX_SUCCESS)) {
            /* failed to back up (shadow) the cursors */
            txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE;
            txn->flags |= MDBX_TXN_ERROR;
          }
        }
      }
      return rc;
    }
    txn->dbi_seqs[dbi] = 0;
    txn->dbi_state[dbi] = DBI_LINDO;
  } else {
    eASSERT(env, txn->dbi_seqs[dbi] != env->dbi_seqs[dbi].weak);
    if (unlikely((txn->dbi_state[dbi] & (DBI_VALID | DBI_OLDEN)) ||
                 txn->cursors[dbi])) {
      /* the handle was already used in this transaction but was closed or
       * re-opened; or there are dangling cursors at an explicit re-opening */
      eASSERT(env, (txn->dbi_state[dbi] & DBI_STALE) == 0);
      txn->dbi_seqs[dbi] = env->dbi_seqs[dbi].weak;
      txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO;
      return txn->cursors[dbi] ? MDBX_DANGLING_DBI : MDBX_BAD_DBI;
    }
  }
  /* the handle was not used in this transaction, or is explicitly being
   * re-opened while no dangling cursors exist */
  eASSERT(env, (txn->dbi_state[dbi] & DBI_LINDO) && !txn->cursors[dbi]);
  /* read the up-to-date flags and sequence */
  struct dbi_snap_result snap = dbi_snap(env, dbi);
  txn->dbi_seqs[dbi] = snap.sequence;
  if (snap.flags & DB_VALID) {
    txn->dbs[dbi].flags = snap.flags & DB_PERSISTENT_FLAGS;
    txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_STALE;
    return MDBX_SUCCESS;
  }
  return MDBX_BAD_DBI;
}
/* Release env->dbi_lock and dispose of deferred-free items.
 * With MDBX_ENABLE_DBI_LOCKFREE the items from `chain` are time-stamped and
 * parked on env->defer_free, while items parked for more than one second are
 * unlinked and freed only after the mutex is released — this keeps the name
 * buffers alive long enough for concurrent lock-free readers. Without the
 * lock-free mode the chain is freed immediately.
 * Returns MDBX_SUCCESS when a chain was supplied, MDBX_BAD_DBI otherwise. */
static int defer_and_release(MDBX_env *const env,
                             defer_free_item_t *const chain) {
  size_t length = 0;
  defer_free_item_t *obsolete_chain = nullptr;
#if MDBX_ENABLE_DBI_LOCKFREE
  const uint64_t now = osal_monotime();
  defer_free_item_t **scan = &env->defer_free;
  if (env->defer_free) {
    const uint64_t threshold_1second = osal_16dot16_to_monotime(1 * 65536);
    do {
      defer_free_item_t *item = *scan;
      if (now - item->timestamp < threshold_1second) {
        /* too young to free: keep it parked */
        scan = &item->next;
        length += 1;
      } else {
        /* old enough: unlink it for freeing after the mutex is released */
        *scan = item->next;
        item->next = obsolete_chain;
        obsolete_chain = item;
      }
    } while (*scan);
  }
  eASSERT(env, *scan == nullptr);
  if (chain) {
    /* time-stamp the new items and append them to the parked list */
    defer_free_item_t *item = chain;
    do {
      item->timestamp = now;
      item = item->next;
    } while (item);
    *scan = chain;
  }
#else /* MDBX_ENABLE_DBI_LOCKFREE */
  obsolete_chain = chain;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
  ENSURE(env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS);
  if (length > 42) {
    /* heuristic: the parked list got long — yield to let other threads run */
#if defined(_WIN32) || defined(_WIN64)
    SwitchToThread();
#else
    sched_yield();
#endif /* Windows */
  }
  while (obsolete_chain) {
    defer_free_item_t *item = obsolete_chain;
    obsolete_chain = obsolete_chain->next;
    osal_free(item);
  }
  return chain ? MDBX_SUCCESS : MDBX_BAD_DBI;
}
/* Export or close DBI handles opened in this txn.
 * With `keep` non-zero, handles created within the txn are published to the
 * environment (flags | DB_VALID); otherwise the slots are cleared and the
 * name allocations are handed to defer_and_release(). Finally env->n_dbi is
 * trimmed down past the trailing unused slots. */
int dbi_update(MDBX_txn *txn, int keep) {
  MDBX_env *const env = txn->env;
  tASSERT(txn, !txn->parent && txn == env->basal_txn);
  bool locked = false;
  defer_free_item_t *defer_chain = nullptr;
  TXN_FOREACH_DBI_USER(txn, dbi) {
    if (likely((txn->dbi_state[dbi] & DBI_CREAT) == 0))
      continue;
    if (!locked) {
      int err = osal_fastmutex_acquire(&env->dbi_lock);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      locked = true;
      if (dbi >= env->n_dbi)
        /* the handle was closed by another thread
         * while the lock was being acquired */
        continue;
    }
    tASSERT(txn, dbi < env->n_dbi);
    if (keep) {
      env->dbs_flags[dbi] = txn->dbs[dbi].flags | DB_VALID;
    } else {
      /* bump the sequence so stale txn-level snapshots become invalid,
       * then queue the name buffer for deferred freeing */
      uint32_t seq = dbi_seq_next(env, dbi);
      defer_free_item_t *item = env->kvs[dbi].name.iov_base;
      if (item) {
        env->dbs_flags[dbi] = 0;
        env->kvs[dbi].name.iov_len = 0;
        env->kvs[dbi].name.iov_base = nullptr;
        atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
        osal_flush_incoherent_cpu_writeback();
        item->next = defer_chain;
        defer_chain = item;
      } else {
        eASSERT(env, env->kvs[dbi].name.iov_len == 0);
        eASSERT(env, env->dbs_flags[dbi] == 0);
      }
    }
  }
  if (locked) {
    /* shrink n_dbi down to the last slot still in use */
    size_t i = env->n_dbi;
    while ((env->dbs_flags[i - 1] & DB_VALID) == 0) {
      --i;
      eASSERT(env, i >= CORE_DBS);
      eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len &&
                       !env->kvs[i].name.iov_base);
    }
    env->n_dbi = (unsigned)i;
    defer_and_release(env, defer_chain);
  }
  return MDBX_SUCCESS;
}
/* Bind a DBI slot to the given flags and comparators, checking them for
 * compatibility with the current (persistent) state of the subDB, and
 * re-creating an empty subDB in-place when MDBX_CREATE is given together
 * with different flags.
 * Returns MDBX_SUCCESS, MDBX_INCOMPATIBLE, MDBX_DANGLING_DBI, MDBX_EINVAL
 * or MDBX_PROBLEM. */
int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags,
             MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  const MDBX_env *const env = txn->env;
  eASSERT(env, dbi < txn->n_dbi && dbi < env->n_dbi);
  eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO);
  eASSERT(env, env->dbs_flags[dbi] != DB_POISON);
  if ((env->dbs_flags[dbi] & DB_VALID) == 0) {
    eASSERT(env, !env->kvs[dbi].clc.k.cmp && !env->kvs[dbi].clc.v.cmp &&
                     !env->kvs[dbi].name.iov_len &&
                     !env->kvs[dbi].name.iov_base &&
                     !env->kvs[dbi].clc.k.lmax && !env->kvs[dbi].clc.k.lmin &&
                     !env->kvs[dbi].clc.v.lmax && !env->kvs[dbi].clc.v.lmin);
  } else {
    eASSERT(env, !(txn->dbi_state[dbi] & DBI_VALID) ||
                     (txn->dbs[dbi].flags | DB_VALID) == env->dbs_flags[dbi]);
    eASSERT(env, env->kvs[dbi].name.iov_base || dbi < CORE_DBS);
  }
  /* If the dbi is already in use, four cases are considered valid:
   * 1) user_flags equal to MDBX_DB_ACCEDE
   *    = assume the user opens an existing subDb; the checking code will
   *      not allow other comparators to be set.
   * 2) user_flags are zero, and both comparators are null or equal to the
   *    current ones
   *    = assume the user opens an existing subDb the old way,
   *      with zeros as the default flags.
   * 3) user_flags are the same, and the comparators are not given or the
   *    same
   *    = assume the user opens the subDb specifying all parameters;
   * 4) user_flags differ, but the subDb is empty and MDBX_CREATE is given
   *    = assume the user is re-creating the subDb. */
  if ((user_flags & ~MDBX_CREATE) !=
      (unsigned)(env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS)) {
    /* flags are differs, check other conditions */
    if ((!user_flags && (!keycmp || keycmp == env->kvs[dbi].clc.k.cmp) &&
         (!datacmp || datacmp == env->kvs[dbi].clc.v.cmp)) ||
        user_flags == MDBX_DB_ACCEDE) {
      user_flags = env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS;
    } else if ((user_flags & MDBX_CREATE) == 0)
      return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
    else {
      eASSERT(env, env->dbs_flags[dbi] & DB_VALID);
      if (txn->dbi_state[dbi] & DBI_STALE) {
        /* FIX: the error check here was inverted (`err == MDBX_SUCCESS`):
         * a successful fetch returned "success" without re-binding, while
         * a failure fell through with DBI_STALE still set, violating the
         * eASSERT below. Proceed only after a successful fetch and
         * propagate the error otherwise. */
        int err = sdb_fetch(txn, dbi);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
      }
      eASSERT(env,
              (txn->dbi_state[dbi] & (DBI_LINDO | DBI_VALID | DBI_STALE)) ==
                  (DBI_LINDO | DBI_VALID));
      if (unlikely(txn->dbs[dbi].leaf_pages))
        return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
      /* Re-create the subDB since it is empty */
      if (unlikely(txn->cursors[dbi]))
        return MDBX_DANGLING_DBI;
      env->dbs_flags[dbi] = DB_POISON;
      atomic_store32(&env->dbi_seqs[dbi], dbi_seq_next(env, MAIN_DBI),
                     mo_AcquireRelease);
      const uint32_t seq = dbi_seq_next(env, dbi);
      const uint16_t db_flags = user_flags & DB_PERSISTENT_FLAGS;
      eASSERT(env, txn->dbs[dbi].height == 0 && txn->dbs[dbi].items == 0 &&
                       txn->dbs[dbi].root == P_INVALID);
      env->kvs[dbi].clc.k.cmp = keycmp ? keycmp : builtin_keycmp(user_flags);
      env->kvs[dbi].clc.v.cmp = datacmp ? datacmp : builtin_datacmp(user_flags);
      txn->dbs[dbi].flags = db_flags;
      txn->dbs[dbi].dupfix_size = 0;
      if (unlikely(sdb_setup(env, &env->kvs[dbi], &txn->dbs[dbi]))) {
        txn->dbi_state[dbi] = DBI_LINDO;
        txn->flags |= MDBX_TXN_ERROR;
        return MDBX_PROBLEM;
      }
      env->dbs_flags[dbi] = db_flags | DB_VALID;
      atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
      txn->dbi_seqs[dbi] = seq;
      txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_CREAT | DBI_DIRTY;
      txn->flags |= MDBX_TXN_DIRTY;
    }
  }
  /* bind the key comparator (built-in default when not explicitly given) */
  if (!keycmp)
    keycmp = (env->dbs_flags[dbi] & DB_VALID) ? env->kvs[dbi].clc.k.cmp
                                              : builtin_keycmp(user_flags);
  if (env->kvs[dbi].clc.k.cmp != keycmp) {
    if (env->dbs_flags[dbi] & DB_VALID)
      return MDBX_EINVAL;
    env->kvs[dbi].clc.k.cmp = keycmp;
  }
  /* bind the data comparator likewise */
  if (!datacmp)
    datacmp = (env->dbs_flags[dbi] & DB_VALID) ? env->kvs[dbi].clc.v.cmp
                                               : builtin_datacmp(user_flags);
  if (env->kvs[dbi].clc.v.cmp != datacmp) {
    if (env->dbs_flags[dbi] & DB_VALID)
      return MDBX_EINVAL;
    env->kvs[dbi].clc.v.cmp = datacmp;
  }
  return MDBX_SUCCESS;
}
/* Allocation size for a subDB name buffer: at least sizeof(defer_free_item_t)
 * so the freed buffer can be reused as a deferred-free list node. */
static inline size_t dbi_namelen(const MDBX_val name) {
  const size_t floor = sizeof(defer_free_item_t);
  return (name.iov_len > floor) ? name.iov_len : floor;
}
/* Open (or create) a named subDB while env->dbi_lock is held: re-uses an
 * existing slot when the name matches, otherwise claims a free slot, loads
 * the tree_t record from MAIN_DBI (creating it when MDBX_CREATE is given)
 * and binds flags/comparators via dbi_bind(). */
static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi,
                           MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp,
                           MDBX_val name) {
  MDBX_env *const env = txn->env;
  /* Cannot mix named table(s) with DUPSORT flags */
  tASSERT(txn,
          (txn->dbi_state[MAIN_DBI] & (DBI_LINDO | DBI_VALID | DBI_STALE)) ==
              (DBI_LINDO | DBI_VALID));
  if (unlikely(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT)) {
    if (unlikely((user_flags & MDBX_CREATE) == 0))
      return MDBX_NOTFOUND;
    if (unlikely(txn->dbs[MAIN_DBI].leaf_pages))
      /* MainDB has records, or it has already been used. */
      return MDBX_INCOMPATIBLE;
    /* Re-create MainDB while it is empty. */
    tASSERT(txn, txn->dbs[MAIN_DBI].height == 0 &&
                     txn->dbs[MAIN_DBI].items == 0 &&
                     txn->dbs[MAIN_DBI].root == P_INVALID);
    if (unlikely(txn->cursors[MAIN_DBI]))
      return MDBX_DANGLING_DBI;
    env->dbs_flags[MAIN_DBI] = DB_POISON;
    atomic_store32(&env->dbi_seqs[MAIN_DBI], dbi_seq_next(env, MAIN_DBI),
                   mo_AcquireRelease);
    const uint32_t seq = dbi_seq_next(env, MAIN_DBI);
    const uint16_t main_flags =
        txn->dbs[MAIN_DBI].flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY);
    env->kvs[MAIN_DBI].clc.k.cmp = builtin_keycmp(main_flags);
    env->kvs[MAIN_DBI].clc.v.cmp = builtin_datacmp(main_flags);
    txn->dbs[MAIN_DBI].flags = main_flags;
    txn->dbs[MAIN_DBI].dupfix_size = 0;
    int err = sdb_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]);
    if (unlikely(err != MDBX_SUCCESS)) {
      txn->dbi_state[MAIN_DBI] = DBI_LINDO;
      txn->flags |= MDBX_TXN_ERROR;
      env->flags |= ENV_FATAL_ERROR;
      return err;
    }
    env->dbs_flags[MAIN_DBI] = main_flags | DB_VALID;
    txn->dbi_seqs[MAIN_DBI] =
        atomic_store32(&env->dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease);
    txn->dbi_state[MAIN_DBI] |= DBI_DIRTY;
    txn->flags |= MDBX_TXN_DIRTY;
  }
  tASSERT(txn, env->kvs[MAIN_DBI].clc.k.cmp);
  /* Is the DB already open? */
  size_t slot = env->n_dbi;
  for (size_t scan = CORE_DBS; scan < env->n_dbi; ++scan) {
    if ((env->dbs_flags[scan] & DB_VALID) == 0) {
      /* Remember this free slot */
      slot = (slot < scan) ? slot : scan;
      continue;
    }
    if (!env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[scan].name)) {
      slot = scan;
      int err = dbi_check(txn, slot);
      if (err == MDBX_BAD_DBI &&
          txn->dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) {
        /* the handle was used and became invalid, but is now being
         * explicitly re-opened within this transaction */
        eASSERT(env, !txn->cursors[slot]);
        txn->dbi_state[slot] = DBI_LINDO;
        err = dbi_check(txn, slot);
      }
      if (err == MDBX_SUCCESS) {
        err = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
        if (likely(err == MDBX_SUCCESS)) {
          goto done;
        }
      }
      return err;
    }
  }
  /* Fail, if no free slot and max hit */
  if (unlikely(slot >= env->max_dbi))
    return MDBX_DBS_FULL;
  if (env->n_dbi == slot)
    eASSERT(env, !env->dbs_flags[slot] && !env->kvs[slot].name.iov_len &&
                     !env->kvs[slot].name.iov_base);
  /* poison the slot while it is being set up */
  env->dbs_flags[slot] = DB_POISON;
  atomic_store32(&env->dbi_seqs[slot], dbi_seq_next(env, slot),
                 mo_AcquireRelease);
  memset(&env->kvs[slot], 0, sizeof(env->kvs[slot]));
  if (env->n_dbi == slot)
    env->n_dbi = (unsigned)slot + 1;
  eASSERT(env, slot < env->n_dbi);
  int err = dbi_check(txn, slot);
  eASSERT(env, err == MDBX_BAD_DBI);
  if (err != MDBX_BAD_DBI)
    return MDBX_PROBLEM;
  /* Find the DB info */
  MDBX_val body;
  cursor_couple_t cx;
  int rc = cursor_init(&cx.outer, txn, MAIN_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  rc = cursor_seek(&cx.outer, &name, &body, MDBX_SET).err;
  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE))
      return rc;
  } else {
    /* make sure this is actually a table */
    node_t *node =
        page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
    if (unlikely((node_flags(node) & (N_DUPDATA | N_SUBDATA)) != N_SUBDATA))
      return MDBX_INCOMPATIBLE;
    if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(tree_t))) {
      ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED,
            "invalid subDb node size", body.iov_len);
      return MDBX_CORRUPTED;
    }
    memcpy(&txn->dbs[slot], body.iov_base, sizeof(tree_t));
  }
  /* Done here so we cannot fail after creating a new DB */
  void *clone = nullptr;
  if (name.iov_len) {
    clone = osal_malloc(dbi_namelen(name));
    if (unlikely(!clone))
      return MDBX_ENOMEM;
    name.iov_base = memcpy(clone, name.iov_base, name.iov_len);
  } else
    name.iov_base = "";
  uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH;
  if (unlikely(rc)) {
    /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */
    tASSERT(txn, rc == MDBX_NOTFOUND);
    body.iov_base = memset(&txn->dbs[slot], 0, body.iov_len = sizeof(tree_t));
    txn->dbs[slot].root = P_INVALID;
    txn->dbs[slot].mod_txnid = txn->txnid;
    txn->dbs[slot].flags = user_flags & DB_PERSISTENT_FLAGS;
    cx.outer.next = txn->cursors[MAIN_DBI];
    txn->cursors[MAIN_DBI] = &cx.outer;
    rc = cursor_put_checklen(&cx.outer, &name, &body,
                             N_SUBDATA | MDBX_NOOVERWRITE);
    txn->cursors[MAIN_DBI] = cx.outer.next;
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
    dbi_state |= DBI_DIRTY | DBI_CREAT;
    txn->flags |= MDBX_TXN_DIRTY;
    tASSERT(txn, (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) != 0);
  }
  /* Got info, register DBI in this txn */
  const uint32_t seq = dbi_seq_next(env, slot);
  eASSERT(env,
          env->dbs_flags[slot] == DB_POISON && !txn->cursors[slot] &&
              (txn->dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO);
  txn->dbi_state[slot] = dbi_state;
  memcpy(&txn->dbs[slot], body.iov_base, sizeof(txn->dbs[slot]));
  env->dbs_flags[slot] = txn->dbs[slot].flags;
  rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
  if (unlikely(rc != MDBX_SUCCESS))
    goto bailout;
  /* publish the slot: set the name, then DB_VALID, then the new sequence */
  env->kvs[slot].name = name;
  env->dbs_flags[slot] = txn->dbs[slot].flags | DB_VALID;
  txn->dbi_seqs[slot] =
      atomic_store32(&env->dbi_seqs[slot], seq, mo_AcquireRelease);
done:
  *dbi = (MDBX_dbi)slot;
  tASSERT(txn, slot < txn->n_dbi && (env->dbs_flags[slot] & DB_VALID) != 0);
  eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS);
  return MDBX_SUCCESS;
bailout:
  /* roll the slot back to an unused state and free the cloned name */
  eASSERT(env, !txn->cursors[slot] && !env->kvs[slot].name.iov_len &&
                   !env->kvs[slot].name.iov_base);
  txn->dbi_state[slot] &= DBI_LINDO | DBI_OLDEN;
  env->dbs_flags[slot] = 0;
  osal_free(clone);
  if (slot + 1 == env->n_dbi)
    txn->n_dbi = env->n_dbi = (unsigned)slot;
  return rc;
}
/* Open (or create, when MDBX_CREATE is given) a named table inside `txn`.
 *
 * The pre-defined pseudo-names MDBX_CHK_MAIN/GC/META are resolved without
 * touching the handle registry.  Otherwise, when MDBX_ENABLE_DBI_LOCKFREE,
 * a lock-free probe over already-open handles is tried first; the fallback
 * is the serialized slow path under env->dbi_lock.
 * On success *dbi receives the handle; otherwise an error code is returned. */
int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags,
             MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  if (unlikely(!dbi))
    return MDBX_EINVAL;
  *dbi = 0;

  /* MDBX_ACCEDE means "adopt whatever persistent flags the table already
   * has", so the flags themselves are not validated in that case. */
  if (user_flags != MDBX_ACCEDE &&
      unlikely(!check_sdb_flags(user_flags & ~MDBX_CREATE)))
    return MDBX_EINVAL;

  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* creating a table requires a write transaction */
  if ((user_flags & MDBX_CREATE) && unlikely(txn->flags & MDBX_TXN_RDONLY))
    return MDBX_EACCESS;

  /* main table? */
  if (unlikely(name == MDBX_CHK_MAIN || name->iov_base == MDBX_CHK_MAIN)) {
    rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp);
    if (likely(rc == MDBX_SUCCESS))
      *dbi = MAIN_DBI;
    return rc;
  }
  if (unlikely(name == MDBX_CHK_GC || name->iov_base == MDBX_CHK_GC)) {
    rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp);
    if (likely(rc == MDBX_SUCCESS))
      *dbi = FREE_DBI;
    return rc;
  }
  if (unlikely(name == MDBX_CHK_META || name->iov_base == MDBX_CHK_META))
    return MDBX_EINVAL;

  /* the name must fit into a leaf node together with its tree_t record */
  if (unlikely(name->iov_len >
               txn->env->leaf_nodemax - NODESIZE - sizeof(tree_t)))
    return MDBX_EINVAL;

#if MDBX_ENABLE_DBI_LOCKFREE
  /* Is the DB already open? */
  const MDBX_env *const env = txn->env;
  size_t free_slot = env->n_dbi;
  for (size_t i = CORE_DBS; i < env->n_dbi; ++i) {
  retry:
    if ((env->dbs_flags[i] & DB_VALID) == 0) {
      free_slot = i;
      continue;
    }
    /* take a snapshot of the slot: sequence first, then flags and name */
    const uint32_t snap_seq =
        atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease);
    const uint16_t snap_flags = env->dbs_flags[i];
    const MDBX_val snap_name = env->kvs[i].name;
    if (user_flags != MDBX_ACCEDE &&
        (((user_flags ^ snap_flags) & DB_PERSISTENT_FLAGS) ||
         (keycmp && keycmp != env->kvs[i].clc.k.cmp) ||
         (datacmp && datacmp != env->kvs[i].clc.v.cmp)))
      continue;
    /* also snapshot MAIN's sequence and key-comparator, which guard the
     * validity of the name bytes compared below */
    const uint32_t main_seq =
        atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease);
    MDBX_cmp_func *const snap_cmp = env->kvs[MAIN_DBI].clc.k.cmp;
    if (unlikely(!(snap_flags & DB_VALID) || !snap_name.iov_base ||
                 !snap_name.iov_len || !snap_cmp))
      continue;
    const bool name_match = snap_cmp(&snap_name, name) == 0;
    osal_flush_incoherent_cpu_writeback();
    /* re-validate the whole snapshot; any concurrent change => retry */
    if (unlikely(
            snap_seq != atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease) ||
            main_seq !=
                atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease) ||
            snap_flags != env->dbs_flags[i] ||
            snap_name.iov_base != env->kvs[i].name.iov_base ||
            snap_name.iov_len != env->kvs[i].name.iov_len))
      goto retry;
    if (name_match) {
      rc = dbi_check(txn, i);
      if (rc == MDBX_BAD_DBI && txn->dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) {
        /* the handle was used and became invalid, but now it is being
         * explicitly re-opened within this transaction */
        eASSERT(env, !txn->cursors[i]);
        txn->dbi_state[i] = DBI_LINDO;
        rc = dbi_check(txn, i);
      }
      if (likely(rc == MDBX_SUCCESS)) {
        rc = dbi_bind(txn, i, user_flags, keycmp, datacmp);
        if (likely(rc == MDBX_SUCCESS))
          *dbi = (MDBX_dbi)i;
      }
      return rc;
    }
  }

  /* Fail, if no free slot and max hit */
  if (unlikely(free_slot >= env->max_dbi))
    return MDBX_DBS_FULL;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */

  /* slow path: open/register the table under the handle-table mutex */
  rc = osal_fastmutex_acquire(&txn->env->dbi_lock);
  if (likely(rc == MDBX_SUCCESS)) {
    rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name);
    ENSURE(txn->env,
           osal_fastmutex_release(&txn->env->dbi_lock) == MDBX_SUCCESS);
  }
  return rc;
}
/* Wrap a NUL-terminated table name into an MDBX_val and forward to
 * dbi_open().  The MDBX_CHK_* pseudo-pointers are passed through as-is. */
static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr,
                         MDBX_db_flags_t flags, MDBX_dbi *dbi,
                         MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC ||
      name_cstr == MDBX_CHK_META)
    return dbi_open(txn, (void *)name_cstr, flags, dbi, keycmp, datacmp);

  MDBX_val wrapped;
  wrapped.iov_base = (void *)name_cstr;
  wrapped.iov_len = strlen(name_cstr);
  return dbi_open(txn, &wrapped, flags, dbi, keycmp, datacmp);
}
/* Result of dbi_rename_locked(): `err` is the status code, and `defer`
 * (when non-null) is a heap block — the superseded name buffer, re-used
 * as a defer_free_item_t — that the caller must hand to the deferred-free
 * machinery. */
struct dbi_rename_result {
  defer_free_item_t *defer;
  int err;
};
/* Rename table `dbi` to `new_name`; must be called under env->dbi_lock.
 * Inserts the tree_t record under the new name into MAIN, deletes the old
 * record, then swaps the cached name.  Returns the superseded name buffer
 * in .defer for deferred release and the status in .err. */
__cold static struct dbi_rename_result
dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) {
  struct dbi_rename_result pair;
  pair.defer = nullptr;
  pair.err = dbi_check(txn, dbi);
  if (unlikely(pair.err != MDBX_SUCCESS))
    return pair;

  MDBX_env *const env = txn->env;
  MDBX_val old_name = env->kvs[dbi].name;
  /* renaming to the very same name is a no-op (still exercised in debug
   * builds to keep the code path covered) */
  if (env->kvs[MAIN_DBI].clc.k.cmp(&new_name, &old_name) == 0 &&
      MDBX_DEBUG == 0)
    return pair;

  cursor_couple_t cx;
  pair.err = cursor_init(&cx.outer, txn, MAIN_DBI);
  if (unlikely(pair.err != MDBX_SUCCESS))
    return pair;
  /* the target name must not exist yet */
  pair.err = cursor_seek(&cx.outer, &new_name, nullptr, MDBX_SET).err;
  if (unlikely(pair.err != MDBX_NOTFOUND)) {
    pair.err = (pair.err == MDBX_SUCCESS) ? MDBX_KEYEXIST : pair.err;
    return pair;
  }

  /* clone the new name into a heap buffer that will live in env->kvs[] */
  pair.defer = osal_malloc(dbi_namelen(new_name));
  if (unlikely(!pair.defer)) {
    pair.err = MDBX_ENOMEM;
    return pair;
  }
  new_name.iov_base = memcpy(pair.defer, new_name.iov_base, new_name.iov_len);

  cx.outer.next = txn->cursors[MAIN_DBI];
  txn->cursors[MAIN_DBI] = &cx.outer;
  MDBX_val data = {&txn->dbs[dbi], sizeof(tree_t)};
  pair.err = cursor_put_checklen(&cx.outer, &new_name, &data,
                                 N_SUBDATA | MDBX_NOOVERWRITE);
  if (likely(pair.err == MDBX_SUCCESS)) {
    pair.err = cursor_seek(&cx.outer, &old_name, nullptr, MDBX_SET).err;
    if (likely(pair.err == MDBX_SUCCESS))
      pair.err = cursor_del(&cx.outer, N_SUBDATA);
    if (likely(pair.err == MDBX_SUCCESS)) {
      /* success: hand the old name buffer back for deferred free */
      pair.defer = env->kvs[dbi].name.iov_base;
      env->kvs[dbi].name = new_name;
    } else
      txn->flags |= MDBX_TXN_ERROR;
  }
  txn->cursors[MAIN_DBI] = cx.outer.next;
  return pair;
}
/* Close handle `dbi` while holding env->dbi_lock.  Invalidates the slot,
 * bumps its sequence so other transactions notice, trims env->n_dbi when
 * the tail slot is freed, and returns the name buffer (re-used as a
 * defer_free_item_t) for deferred release; nullptr if already closed. */
static defer_free_item_t *dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) {
  eASSERT(env, dbi >= CORE_DBS);
  if (unlikely(dbi >= env->n_dbi))
    return nullptr;

  const uint32_t seq = dbi_seq_next(env, dbi);
  defer_free_item_t *defer_item = env->kvs[dbi].name.iov_base;
  if (likely(defer_item)) {
    env->dbs_flags[dbi] = 0;
    env->kvs[dbi].name.iov_len = 0;
    env->kvs[dbi].name.iov_base = nullptr;
    atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
    osal_flush_incoherent_cpu_writeback();
    defer_item->next = nullptr;

    /* shrink n_dbi over the run of trailing free slots */
    if (env->n_dbi == dbi + 1) {
      size_t i = env->n_dbi;
      do {
        --i;
        eASSERT(env, i >= CORE_DBS);
        eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len &&
                         !env->kvs[i].name.iov_base);
      } while (i > CORE_DBS && !env->kvs[i - 1].name.iov_base);
      env->n_dbi = (unsigned)i;
    }
  }
  return defer_item;
}
/*----------------------------------------------------------------------------*/
/* API */
/* Public API: open/create a table by C-string name, default comparators. */
int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags,
                  MDBX_dbi *dbi) {
  return dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr);
}
/* Public API: open/create a table by MDBX_val name, default comparators. */
int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags,
                   MDBX_dbi *dbi) {
  return dbi_open(txn, name, flags, dbi, nullptr, nullptr);
}
/* Public API: open/create a table by C-string name with custom
 * key/data comparators (deprecated interface kept for compatibility). */
int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags,
                     MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
                     MDBX_cmp_func *datacmp) {
  return dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp);
}
/* Public API: open/create a table by MDBX_val name with custom
 * key/data comparators. */
int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name,
                      MDBX_db_flags_t flags, MDBX_dbi *dbi,
                      MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  return dbi_open(txn, name, flags, dbi, keycmp, datacmp);
}
/* Public API: empty table `dbi`; when `del` is true (and the table is not
 * a core one) also delete its record from MAIN and release the handle. */
__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) {
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  cursor_couple_t cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* free the whole tree; for MAIN or a dupsort table also its sub-trees */
  if (txn->dbs[dbi].height) {
    cx.outer.next = txn->cursors[dbi];
    txn->cursors[dbi] = &cx.outer;
    rc = tree_drop(&cx.outer,
                   dbi == MAIN_DBI || (cx.outer.tree->flags & MDBX_DUPSORT));
    txn->cursors[dbi] = cx.outer.next;
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  /* Invalidate the dropped DB's cursors */
  for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next)
    be_poor(mc);

  if (!del || dbi < CORE_DBS) {
    /* reset the DB record, mark it dirty */
    txn->dbi_state[dbi] |= DBI_DIRTY;
    txn->dbs[dbi].height = 0;
    txn->dbs[dbi].branch_pages = 0;
    txn->dbs[dbi].leaf_pages = 0;
    txn->dbs[dbi].large_pages = 0;
    txn->dbs[dbi].items = 0;
    txn->dbs[dbi].root = P_INVALID;
    txn->dbs[dbi].sequence = 0;
    /* txn->dbs[dbi].mod_txnid = txn->txnid; */
    txn->flags |= MDBX_TXN_DIRTY;
    return MDBX_SUCCESS;
  }

  /* full deletion: remove the name record from MAIN, then close the
   * handle under the registry mutex */
  MDBX_env *const env = txn->env;
  MDBX_val name = env->kvs[dbi].name;
  rc = cursor_init(&cx.outer, txn, MAIN_DBI);
  if (likely(rc == MDBX_SUCCESS)) {
    rc = cursor_seek(&cx.outer, &name, nullptr, MDBX_SET).err;
    if (likely(rc == MDBX_SUCCESS)) {
      cx.outer.next = txn->cursors[MAIN_DBI];
      txn->cursors[MAIN_DBI] = &cx.outer;
      rc = cursor_del(&cx.outer, N_SUBDATA);
      txn->cursors[MAIN_DBI] = cx.outer.next;
      if (likely(rc == MDBX_SUCCESS)) {
        tASSERT(txn, txn->dbi_state[MAIN_DBI] & DBI_DIRTY);
        tASSERT(txn, txn->flags & MDBX_TXN_DIRTY);
        txn->dbi_state[dbi] = DBI_LINDO | DBI_OLDEN;
        rc = osal_fastmutex_acquire(&env->dbi_lock);
        if (likely(rc == MDBX_SUCCESS))
          return defer_and_release(env, dbi_close_locked(env, dbi));
      }
    }
  }
  txn->flags |= MDBX_TXN_ERROR;
  return rc;
}
/* Public API: rename table `dbi` to the NUL-terminated `name_cstr`.
 * The MDBX_CHK_* pseudo-pointers are forwarded unchanged. */
__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) {
  if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC ||
      name_cstr == MDBX_CHK_META)
    return mdbx_dbi_rename2(txn, dbi, (void *)name_cstr);

  MDBX_val wrapped;
  wrapped.iov_base = (void *)name_cstr;
  wrapped.iov_len = strlen(name_cstr);
  return mdbx_dbi_rename2(txn, dbi, &wrapped);
}
/* Public API: close handle `dbi`.
 *
 * Closing MAIN_DBI is a silent no-op (MDBX_SUCCESS); other pre-defined
 * handles and out-of-range values yield MDBX_BAD_DBI.  The actual close is
 * serialized by env->dbi_lock and the name buffer is released via the
 * deferred-free machinery.
 *
 * Fix: the original re-tested `dbi < CORE_DBS || dbi >= env->max_dbi`
 * immediately after both halves of that condition had already returned;
 * the unreachable duplicate check is removed. */
int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(dbi < CORE_DBS))
    return (dbi == MAIN_DBI) ? MDBX_SUCCESS : MDBX_BAD_DBI;
  if (unlikely(dbi >= env->max_dbi))
    return MDBX_BAD_DBI;

  rc = osal_fastmutex_acquire(&env->dbi_lock);
  if (likely(rc == MDBX_SUCCESS))
    rc = defer_and_release(env, dbi_close_locked(env, dbi));
  return rc;
}
/* Public API: report the persistent flags and the per-transaction state
 * bits of handle `dbi`.  Both out-pointers are mandatory. */
int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags,
                      unsigned *state) {
  int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(flags == nullptr || state == nullptr))
    return MDBX_EINVAL;
  err = dbi_check(txn, dbi);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  const unsigned state_mask = DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE;
  *flags = txn->dbs[dbi].flags & DB_PERSISTENT_FLAGS;
  *state = txn->dbi_state[dbi] & state_mask;
  return MDBX_SUCCESS;
}
/* Public API: rename table `dbi` to `new_name` (MDBX_val form).
 * Rejects the pre-defined pseudo-names and core handles, then performs
 * the rename under env->dbi_lock via dbi_rename_locked(). */
__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi,
                            const MDBX_val *new_name) {
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(new_name == MDBX_CHK_MAIN ||
               new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC ||
               new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META ||
               new_name->iov_base == MDBX_CHK_META))
    return MDBX_EINVAL;
  if (unlikely(dbi < CORE_DBS))
    return MDBX_EINVAL;
  rc = dbi_check(txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  rc = osal_fastmutex_acquire(&txn->env->dbi_lock);
  if (likely(rc == MDBX_SUCCESS)) {
    struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name);
    /* the defer item is the superseded name buffer (or nullptr) */
    if (pair.defer)
      pair.defer->next = nullptr;
    defer_and_release(txn->env, pair.defer);
    rc = pair.err;
  }
  return rc;
}
/* Copy B-tree statistics from `db` into the caller-provided `st`.
 * The trailing ms_mod_txnid field is filled only when the caller's struct
 * (of `bytes` size) is large enough to hold it. */
static void stat_get(const tree_t *db, MDBX_stat *st, size_t bytes) {
  st->ms_entries = db->items;
  st->ms_depth = db->height;
  st->ms_branch_pages = db->branch_pages;
  st->ms_leaf_pages = db->leaf_pages;
  st->ms_overflow_pages = db->large_pages;
  const size_t full_size =
      offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid);
  if (likely(bytes >= full_size))
    st->ms_mod_txnid = db->mod_txnid;
}
/* Public API: fetch statistics of table `dbi` into `dest`.
 * `bytes` selects the ABI variant: either the full sizeof(MDBX_stat) or
 * the legacy size without the trailing ms_mod_txnid field. */
__cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest,
                         size_t bytes) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!dest))
    return MDBX_EINVAL;
  rc = dbi_check(txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
  if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
    return MDBX_EINVAL;
  if (unlikely(txn->flags & MDBX_TXN_BLOCKED))
    return MDBX_BAD_TXN;

  /* a stale per-txn copy of the tree_t record must be refreshed first */
  if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
    rc = sdb_fetch((MDBX_txn *)txn, dbi);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  dest->ms_psize = txn->env->ps;
  stat_get(&txn->dbs[dbi], dest, bytes);
  return MDBX_SUCCESS;
}

133
src/dbi.h Normal file
View File

@ -0,0 +1,133 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL size_t
dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi);
#if MDBX_ENABLE_DBI_SPARSE
/* Count trailing zeros of the non-zero bitmap chunk `bmi`, i.e. the index
 * of its lowest set bit.  Uses compiler intrinsics when available, with a
 * portable out-of-line fallback otherwise.
 *
 * Fix: on 32-bit MSVC targets (no _BitScanForward64) a bit found in the
 * high 32-bit half must be biased by +32; the original returned the raw
 * in-word index of the high half. */
static inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) {
  tASSERT(txn, bmi > 0);
  STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->dbi_sparse[0]));
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
  if (sizeof(txn->dbi_sparse[0]) <= sizeof(int))
    return __builtin_ctz((int)bmi);
  if (sizeof(txn->dbi_sparse[0]) == sizeof(long))
    return __builtin_ctzl((long)bmi);
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) ||            \
    __has_builtin(__builtin_ctzll)
  return __builtin_ctzll(bmi);
#endif /* have(long long) && long long == uint64_t */
#endif /* GNU C */

#if defined(_MSC_VER)
  unsigned long index;
  if (sizeof(txn->dbi_sparse[0]) > 4) {
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
    _BitScanForward64(&index, bmi);
    return index;
#else
    if (bmi > UINT32_MAX) {
      /* the low half is all zeros, so the set bit is in the high half */
      _BitScanForward(&index, (uint32_t)((uint64_t)bmi >> 32));
      return index + 32;
    }
#endif
  }
  _BitScanForward(&index, (uint32_t)bmi);
  return index;
#endif /* MSVC */

  return dbi_bitmap_ctz_fallback(txn, bmi);
}
/* LY: The macro is deliberately built around a single loop so that the
 * `break` operator keeps working inside the iteration body.
 *
 * Sparse variant: walks the txn->dbi_sparse bitmap, skipping whole zero
 * chunks and runs of zero bits (via dbi_bitmap_ctz), and yields only the
 * indices I with a non-zero txn->dbi_state[I]. */
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM)                                     \
  for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->dbi_sparse[0]),            \
              bitmap_item = TXN->dbi_sparse[0] >> FROM, I = FROM;              \
       I < TXN->n_dbi; ++I)                                                    \
    if (bitmap_item == 0) {                                                    \
      I = (I - 1) | (bitmap_chunk - 1);                                        \
      bitmap_item = TXN->dbi_sparse[(1 + I) / bitmap_chunk];                   \
      if (!bitmap_item)                                                        \
        I += bitmap_chunk;                                                     \
      continue;                                                                \
    } else if ((bitmap_item & 1) == 0) {                                       \
      size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item);                   \
      bitmap_item >>= bitmap_skip;                                             \
      I += bitmap_skip - 1;                                                    \
      continue;                                                                \
    } else if (bitmap_item >>= 1, TXN->dbi_state[I])
#else
/* Dense variant: plain scan of dbi_state[]. */
#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP)                                     \
  for (size_t I = SKIP; I < TXN->n_dbi; ++I)                                   \
    if (TXN->dbi_state[I])
#endif /* MDBX_ENABLE_DBI_SPARSE */

/* Iterate over all, or only the user-defined, DBI slots of a txn. */
#define TXN_FOREACH_DBI_ALL(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, 0)
#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS)
MDBX_INTERNAL int dbi_import(MDBX_txn *txn, const size_t dbi);
struct dbi_snap_result {
uint32_t sequence;
unsigned flags;
};
MDBX_INTERNAL struct dbi_snap_result dbi_snap(const MDBX_env *env,
const size_t dbi);
MDBX_INTERNAL int dbi_update(MDBX_txn *txn, int keep);
/* Return the per-transaction state byte of handle `dbi`, or 0 when the
 * handle is out of range (or, in the sparse build, not marked in the
 * bitmap, i.e. never touched by this transaction). */
static inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) {
  STATIC_ASSERT(
      (int)DBI_DIRTY == MDBX_DBI_DIRTY && (int)DBI_STALE == MDBX_DBI_STALE &&
      (int)DBI_FRESH == MDBX_DBI_FRESH && (int)DBI_CREAT == MDBX_DBI_CREAT);

#if MDBX_ENABLE_DBI_SPARSE
  /* the bitmap marks which dbi_state[] entries are meaningful */
  const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
  const size_t bitmap_indx = dbi / bitmap_chunk;
  const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk;
  return likely(dbi < txn->n_dbi &&
                (txn->dbi_sparse[bitmap_indx] & bitmap_mask) != 0)
             ? txn->dbi_state[dbi]
             : 0;
#else
  return likely(dbi < txn->n_dbi) ? txn->dbi_state[dbi] : 0;
#endif /* MDBX_ENABLE_DBI_SPARSE */
}
/* Tell whether the global sequence of handle `dbi` has moved on since this
 * transaction captured it (i.e. the handle was closed/reopened elsewhere). */
static inline bool dbi_changed(const MDBX_txn *txn, const size_t dbi) {
  const MDBX_env *const env = txn->env;
  eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO);
  const uint32_t current_seq =
      atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
  return current_seq != txn->dbi_seqs[dbi];
}
/* Validate handle `dbi` for use inside `txn`: the fast path succeeds when
 * the handle is already bound to the transaction and unchanged globally. */
static inline int dbi_check(const MDBX_txn *txn, const size_t dbi) {
  const uint8_t state = dbi_state(txn, dbi);
  if (likely((state & DBI_LINDO) != 0 && !dbi_changed(txn, dbi)))
    return (state & DBI_VALID) ? MDBX_SUCCESS : MDBX_BAD_DBI;

  /* Slow path: lazy late-initialization and import */
  return dbi_import((MDBX_txn *)txn, dbi);
}
/* Produce the next sequence value for handle `dbi`; zero is reserved, so
 * wrap-around lands on 1. */
static inline uint32_t dbi_seq_next(const MDBX_env *const env, size_t dbi) {
  const uint32_t next =
      atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease) + 1;
  return (next != 0) ? next : 1;
}
MDBX_INTERNAL int dbi_open(MDBX_txn *txn, const MDBX_val *const name,
unsigned user_flags, MDBX_dbi *dbi,
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp);
MDBX_INTERNAL int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags,
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp);

520
src/dpl.c Normal file
View File

@ -0,0 +1,520 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Convert a dirty-page-list capacity (slot count) into an allocation size
 * in bytes, rounding up to the malloc-friendly granule and subtracting the
 * assumed allocator overhead. */
static inline size_t dpl_size2bytes(ptrdiff_t size) {
  assert(size > CURSOR_STACK_SIZE && (size_t)size <= PAGELIST_LIMIT);
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
  /* radix-sort needs a shadow array of the same length */
  size += size;
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
  STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) +
                    (PAGELIST_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1)) *
                        sizeof(dp_t) +
                    MDBX_PNL_GRANULATE * sizeof(void *) * 2 <
                SIZE_MAX / 4 * 3);
  size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) +
                                   size * sizeof(dp_t),
                               MDBX_PNL_GRANULATE * sizeof(void *) * 2) -
                 MDBX_ASSUME_MALLOC_OVERHEAD;
  return bytes;
}
/* Inverse of dpl_size2bytes(): how many dirty-page slots an allocation of
 * `bytes` can hold. */
static inline size_t dpl_bytes2size(const ptrdiff_t bytes) {
  size_t slots = (bytes - sizeof(dpl_t)) / sizeof(dp_t);
  assert(slots > CURSOR_STACK_SIZE &&
         slots <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
  /* half of the space belongs to the radix-sort shadow array */
  slots >>= 1;
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
  return slots;
}
/* Release the transaction's dirty-page list (if any) and clear the ref. */
void dpl_free(MDBX_txn *txn) {
  dpl_t *const list = txn->tw.dirtylist;
  if (likely(list != nullptr)) {
    txn->tw.dirtylist = nullptr;
    osal_free(list);
  }
}
/* (Re)allocate the dirty-page list to hold at least `size` items (clamped
 * to PAGELIST_LIMIT).  Returns the list, or nullptr on allocation failure
 * (a previously existing list stays valid in that case). */
dpl_t *dpl_reserve(MDBX_txn *txn, size_t size) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);

  size_t bytes =
      dpl_size2bytes((size < PAGELIST_LIMIT) ? size : PAGELIST_LIMIT);
  dpl_t *const dl = osal_realloc(txn->tw.dirtylist, bytes);
  if (likely(dl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    /* take advantage of the allocator's real usable size */
    bytes = malloc_usable_size(dl);
#endif /* malloc_usable_size */
    dl->detent = dpl_bytes2size(bytes);
    tASSERT(txn, txn->tw.dirtylist == nullptr || dl->length <= dl->detent);
    txn->tw.dirtylist = dl;
  }
  return dl;
}
/* Ensure the transaction has a dirty-page list sized around the configured
 * dp_initial (re-allocating when far off), then reset it to empty. */
int dpl_alloc(MDBX_txn *txn) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);

  const size_t wanna = (txn->env->options.dp_initial < txn->geo.upper)
                           ? txn->env->options.dp_initial
                           : txn->geo.upper;
#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG
  if (txn->tw.dirtylist)
    /* zero the lengths so the assert inside dpl_reserve() won't trip */
    txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0;
#endif /* assertions enabled */
  if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna ||
               txn->tw.dirtylist->detent > wanna + wanna) &&
      unlikely(!dpl_reserve(txn, wanna)))
    return MDBX_ENOMEM;

  dpl_clear(txn->tw.dirtylist);
  return MDBX_SUCCESS;
}
#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno)
RADIXSORT_IMPL(dp, dp_t, MDBX_DPL_EXTRACT_KEY, MDBX_DPL_PREALLOC_FOR_RADIXSORT,
1)
#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno)
SORT_IMPL(dp_sort, false, dp_t, DP_SORT_CMP)
/* Bring the dirty-page list into fully sorted order.
 *
 * Tries radix-sort for large unsorted tails; otherwise either (a) sorts
 * just the unsorted tail in scratch space at the end of the allocation and
 * merges it backwards into the sorted head, or (b) falls back to sorting
 * the whole array. */
__hot __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);

  dpl_t *dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  const size_t unsorted = dl->length - dl->sorted;
  if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) ||
      unlikely(!dp_radixsort(dl->items + 1, dl->length))) {
    if (dl->sorted > unsorted / 4 + 4 &&
        (MDBX_DPL_PREALLOC_FOR_RADIXSORT ||
         dl->length + unsorted < dl->detent + dpl_gap_mergesort)) {
      dp_t *const sorted_begin = dl->items + 1;
      dp_t *const sorted_end = sorted_begin + dl->sorted;
      dp_t *const end = dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT
                                         ? dl->length + dl->length + 1
                                         : dl->detent + dpl_reserve_gap);
      dp_t *const tmp = end - unsorted;
      assert(dl->items + dl->length + 1 < tmp);
      /* copy unsorted to the end of allocated space and sort it */
      memcpy(tmp, sorted_end, unsorted * sizeof(dp_t));
      dp_sort(tmp, tmp + unsorted);
      /* merge two parts from end to begin */
      dp_t *__restrict w = dl->items + dl->length;
      dp_t *__restrict l = dl->items + dl->sorted;
      dp_t *__restrict r = end - 1;
      do {
        const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5);
        /* branchless variant for compilers/targets with CMOV */
#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV
        *w = cmp ? *l-- : *r--;
#else
        *w = cmp ? *l : *r;
        l -= cmp;
        r += (ptrdiff_t)cmp - 1;
#endif
      } while (likely(--w > l));
      assert(r == tmp - 1);
      assert(dl->items[0].pgno == 0 &&
             dl->items[dl->length + 1].pgno == P_INVALID);
      if (ASSERT_ENABLED())
        for (size_t i = 0; i <= dl->length; ++i)
          assert(dl->items[i].pgno < dl->items[i + 1].pgno);
    } else {
      /* tail too long relative to the head: sort everything */
      dp_sort(dl->items + 1, dl->items + dl->length + 1);
      assert(dl->items[0].pgno == 0 &&
             dl->items[dl->length + 1].pgno == P_INVALID);
    }
  } else {
    /* radix-sort handled the whole array */
    assert(dl->items[0].pgno == 0 &&
           dl->items[dl->length + 1].pgno == P_INVALID);
  }
  dl->sorted = dl->length;
  return dl;
}
/* Returns the index of the first dirty-page whose pgno
* member is greater than or equal to id. */
#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id))
SEARCH_IMPL(dp_bsearch, dp_t, pgno_t, DP_SEARCH_CMP)
/* Find the index of the first dirty-page slot whose pgno is >= `pgno`.
 * Short unsorted tails are scanned linearly; longer ones trigger a full
 * sort; the sorted head is binary-searched. */
__hot __noinline MDBX_INTERNAL size_t dpl_search(const MDBX_txn *txn,
                                                 pgno_t pgno) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);

  dpl_t *dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  if (AUDIT_ENABLED()) {
    for (const dp_t *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
      assert(ptr[0].pgno < ptr[1].pgno);
      assert(ptr[0].pgno >= NUM_METAS);
    }
  }

  switch (dl->length - dl->sorted) {
  default:
    /* sort a whole */
    dpl_sort_slowpath(txn);
    break;
  case 0:
    /* whole sorted cases */
    break;

#define LINEAR_SEARCH_CASE(N)                                                  \
  case N:                                                                      \
    if (dl->items[dl->length - N + 1].pgno == pgno)                            \
      return dl->length - N + 1;                                               \
    __fallthrough

    /* use linear scan until the threshold */
    LINEAR_SEARCH_CASE(7); /* fall through */
    LINEAR_SEARCH_CASE(6); /* fall through */
    LINEAR_SEARCH_CASE(5); /* fall through */
    LINEAR_SEARCH_CASE(4); /* fall through */
    LINEAR_SEARCH_CASE(3); /* fall through */
    LINEAR_SEARCH_CASE(2); /* fall through */
  case 1:
    if (dl->items[dl->length].pgno == pgno)
      return dl->length;
    /* continue bsearch on the sorted part */
    break;
  }
  return dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items;
}
/* Debug-only lookup of a page in the dirty-page list without altering it:
 * the unsorted tail is scanned linearly, the sorted head binary-searched.
 * Returns the page pointer or nullptr. */
const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  const dpl_t *dl = txn->tw.dirtylist;
  if (dl) {
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
    assert(dl->items[0].pgno == 0 &&
           dl->items[dl->length + 1].pgno == P_INVALID);
    for (size_t i = dl->length; i > dl->sorted; --i)
      if (dl->items[i].pgno == pgno)
        return dl->items[i].ptr;
    if (dl->sorted) {
      const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items;
      if (dl->items[i].pgno == pgno)
        return dl->items[i].ptr;
    }
  } else {
    /* no list at all is legal only for pure WRITEMAP mode */
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
  }
  return nullptr;
}
/* Remove slot `i` (covering `npages` pages) from the dirty-page list,
 * closing the gap and keeping the terminating stub in place. */
void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);

  dpl_t *dl = txn->tw.dirtylist;
  assert((intptr_t)i > 0 && i <= dl->length);
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  dl->pages_including_loose -= npages;
  dl->sorted -= dl->sorted >= i;
  dl->length -= 1;
  /* shift the tail (including the P_INVALID stub) one slot down */
  memmove(dl->items + i, dl->items + i + 1,
          (dl->length - i + 2) * sizeof(dl->items[0]));
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
/* Append the dirty page `page` (pgno, npages) to the dirty-page list,
 * growing the list when full and keeping it as sorted as is cheap. */
int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page,
                                   size_t npages) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  const dp_t dp = {page, pgno, (pgno_t)npages};
  if ((txn->flags & MDBX_WRITEMAP) == 0) {
    /* stamp the page's LRU word (stored just before the shadow page) */
    size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t));
    *ptr = txn->tw.dirtylru;
  }

  dpl_t *dl = txn->tw.dirtylist;
  tASSERT(txn, dl->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
  tASSERT(txn, dl->items[0].pgno == 0 &&
                   dl->items[dl->length + 1].pgno == P_INVALID);
  if (AUDIT_ENABLED()) {
    for (size_t i = dl->length; i > 0; --i) {
      assert(dl->items[i].pgno != dp.pgno);
      if (unlikely(dl->items[i].pgno == dp.pgno)) {
        ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i);
        return MDBX_PROBLEM;
      }
    }
  }

  if (unlikely(dl->length == dl->detent)) {
    if (unlikely(dl->detent >= PAGELIST_LIMIT)) {
      ERROR("DPL is full (PAGELIST_LIMIT %zu)", PAGELIST_LIMIT);
      return MDBX_TXN_FULL;
    }
    const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42)
                            ? dl->detent + dl->detent
                            : dl->detent + dl->detent / 2;
    dl = dpl_reserve(txn, size);
    if (unlikely(!dl))
      return MDBX_ENOMEM;
    tASSERT(txn, dl->length < dl->detent);
  }

  /* Sorting is required for fast search; several tactics are used:
   *  1) Keep the ordering when elements naturally arrive in order.
   *  2) Append to an unsorted tail, which is sorted and merged with the
   *     sorted head on demand; while the tail is short it is searched by
   *     linear scan, avoiding a big re-sort.
   *  3) If the unsorted tail is short and the new element lands close to
   *     the end of the sorted head, it is cheaper to insert it in place
   *     right away.
   *
   * Algorithmically:
   *  - append to the unsorted tail only when in-place insertion would be
   *    really expensive, i.e. the target position is far from the end;
   *  - a quick check is to compare the new element with the one located
   *    at the maximum acceptable distance from the end;
   *  - if the list is shorter, or the element at that position is less
   *    than the one being inserted, shift elements and insert into the
   *    sorted head;
   *  - if the unsorted tail is longer, or the element at that position is
   *    greater, append to the unsorted tail. */
  dl->pages_including_loose += npages;
  dp_t *i = dl->items + dl->length;
  const ptrdiff_t pivot = (ptrdiff_t)dl->length - dpl_insertion_threshold;
#if MDBX_HAVE_CMOV
  const pgno_t pivot_pgno =
      dl->items[(dl->length < dpl_insertion_threshold)
                    ? 0
                    : dl->length - dpl_insertion_threshold]
          .pgno;
#endif /* MDBX_HAVE_CMOV */

  /* copy the stub beyond the end */
  i[2] = i[1];
  dl->length += 1;

  if (likely(pivot <= (ptrdiff_t)dl->sorted) &&
#if MDBX_HAVE_CMOV
      pivot_pgno < dp.pgno) {
#else
      (pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) {
#endif /* MDBX_HAVE_CMOV */
    dl->sorted += 1;

    /* shift the unsorted tail up by one slot */
    while (i >= dl->items + dl->sorted) {
#if !defined(__GNUC__) /* trying to avoid a memmove() call */
      i[1] = *i;
#elif MDBX_WORDBITS == 64 &&                                                   \
    (defined(__SIZEOF_INT128__) ||                                             \
     (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128))
      STATIC_ASSERT(sizeof(dp) == sizeof(__uint128_t));
      ((__uint128_t *)i)[1] = *(volatile __uint128_t *)i;
#else
      i[1].ptr = i->ptr;
      i[1].pgno = i->pgno;
      i[1].npages = i->npages;
#endif
      --i;
    }

    /* find the target position while shifting sorted elements up */
    while (i->pgno > pgno) {
      tASSERT(txn, i > dl->items);
      i[1] = *i;
      --i;
    }
    tASSERT(txn, i->pgno < dp.pgno);
  }

  i[1] = dp;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  assert(dl->sorted <= dl->length);
  return MDBX_SUCCESS;
}
/* Audit the dirty-page list invariants: slot/page agreement, LRU ages,
 * modifiability, ordering of the sorted head, disjointness from the relist
 * and the retired list, and the loose/total page counters.
 * Returns true when consistent (deep checks only when auditing enabled). */
__cold bool dpl_check(MDBX_txn *txn) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  const dpl_t *const dl = txn->tw.dirtylist;
  if (!dl) {
    /* no list at all is legal only for pure WRITEMAP mode */
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
    return true;
  }
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);

  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  tASSERT(txn, txn->tw.dirtyroom + dl->length ==
                   (txn->parent ? txn->parent->tw.dirtyroom
                                : txn->env->options.dp_limit));

  if (!AUDIT_ENABLED())
    return true;

  size_t loose = 0, pages = 0;
  for (size_t i = dl->length; i > 0; --i) {
    const page_t *const dp = dl->items[i].ptr;
    if (!dp)
      continue;

    tASSERT(txn, dp->pgno == dl->items[i].pgno);
    if (unlikely(dp->pgno != dl->items[i].pgno))
      return false;

    if ((txn->flags & MDBX_WRITEMAP) == 0) {
      const uint32_t age = dpl_age(txn, i);
      tASSERT(txn, age < UINT32_MAX / 3);
      if (unlikely(age > UINT32_MAX / 3))
        return false;
    }

    tASSERT(txn, dp->flags == P_LOOSE || is_modifable(txn, dp));
    if (dp->flags == P_LOOSE) {
      loose += 1;
    } else if (unlikely(!is_modifable(txn, dp)))
      return false;

    const unsigned num = dpl_npages(dl, i);
    pages += num;
    tASSERT(txn, txn->geo.first_unallocated >= dp->pgno + num);
    if (unlikely(txn->geo.first_unallocated < dp->pgno + num))
      return false;

    if (i < dl->sorted) {
      tASSERT(txn, dl->items[i + 1].pgno >= dp->pgno + num);
      if (unlikely(dl->items[i + 1].pgno < dp->pgno + num))
        return false;
    }

    /* a dirty page must not be present in the reclaimed list */
    const size_t rpa =
        pnl_search(txn->tw.relist, dp->pgno, txn->geo.first_unallocated);
    tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.relist) ||
                     txn->tw.relist[rpa] != dp->pgno);
    if (rpa <= MDBX_PNL_GETSIZE(txn->tw.relist) &&
        unlikely(txn->tw.relist[rpa] == dp->pgno))
      return false;
    if (num > 1) {
      const size_t rpb = pnl_search(txn->tw.relist, dp->pgno + num - 1,
                                    txn->geo.first_unallocated);
      tASSERT(txn, rpa == rpb);
      if (unlikely(rpa != rpb))
        return false;
    }
  }

  tASSERT(txn, loose == txn->tw.loose_count);
  if (unlikely(loose != txn->tw.loose_count))
    return false;

  tASSERT(txn, pages == dl->pages_including_loose);
  if (unlikely(pages != dl->pages_including_loose))
    return false;

  /* retired pages must never still be dirty */
  for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) {
    const page_t *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]);
    tASSERT(txn, !dp);
    if (unlikely(dp))
      return false;
  }
  return true;
}
/*----------------------------------------------------------------------------*/
/* Halve the LRU clock and every dirty page's stored age, for this txn and
 * all its parents, to avoid counter overflow. */
__noinline void dpl_lru_reduce(MDBX_txn *txn) {
  NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1);
  tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
  do {
    txn->tw.dirtylru >>= 1;
    dpl_t *dl = txn->tw.dirtylist;
    for (size_t i = 1; i <= dl->length; ++i) {
      /* each page's LRU word is stored just before the shadow page */
      size_t *const ptr =
          ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
      *ptr >>= 1;
    }
    txn = txn->parent;
  } while (txn);
}
/* Remove from the dirty-page list every page listed in the sorted PNL `pl`
 * (shifted right by one when `spilled`), releasing the corresponding
 * shadow copies and compacting the list in a single merged pass. */
void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) {
    tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->geo.first_unallocated
                                             << spilled));
    dpl_t *dl = dpl_sort(txn);

    /* Scanning in ascend order */
    const intptr_t step = MDBX_PNL_ASCENDING ? 1 : -1;
    const intptr_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pl);
    const intptr_t end = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(pl) + 1 : 0;
    tASSERT(txn, pl[begin] <= pl[end - step]);

    /* advance both sorted sequences in lock-step until the first match */
    size_t w, r = dpl_search(txn, pl[begin] >> spilled);
    tASSERT(txn, dl->sorted == dl->length);
    for (intptr_t i = begin; r <= dl->length;) { /* scan loop */
      assert(i != end);
      tASSERT(txn, !spilled || (pl[i] & 1) == 0);
      pgno_t pl_pgno = pl[i] >> spilled;
      pgno_t dp_pgno = dl->items[r].pgno;
      if (likely(dp_pgno != pl_pgno)) {
        const bool cmp = dp_pgno < pl_pgno;
        r += cmp;
        i += cmp ? 0 : step;
        if (likely(i != end))
          continue;
        return;
      }

      /* update loop: from here on, write-compact the surviving items */
      unsigned npages;
      w = r;
    remove_dl:
      npages = dpl_npages(dl, r);
      dl->pages_including_loose -= npages;
      if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
        page_shadow_release(txn->env, dl->items[r].ptr, npages);
      ++r;
    next_i:
      i += step;
      if (unlikely(i == end)) {
        /* PNL exhausted: copy the rest of the DPL through */
        while (r <= dl->length)
          dl->items[w++] = dl->items[r++];
      } else {
        while (r <= dl->length) {
          assert(i != end);
          tASSERT(txn, !spilled || (pl[i] & 1) == 0);
          pl_pgno = pl[i] >> spilled;
          dp_pgno = dl->items[r].pgno;
          if (dp_pgno < pl_pgno)
            dl->items[w++] = dl->items[r++];
          else if (dp_pgno > pl_pgno)
            goto next_i;
          else
            goto remove_dl;
        }
      }
      dl->sorted = dpl_setlen(dl, w - 1);
      txn->tw.dirtyroom += r - w;
      tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                       (txn->parent ? txn->parent->tw.dirtyroom
                                    : txn->env->options.dp_limit));
      return;
    }
  }
}
/* Return every dirty page's shadow copy to the env pool and reset the
 * dirty-page list to empty. */
void dpl_release_shadows(MDBX_txn *txn) {
  tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
  dpl_t *const dl = txn->tw.dirtylist;
  MDBX_env *const env = txn->env;
  const size_t count = dl->length;
  for (size_t idx = 1; idx <= count; ++idx)
    page_shadow_release(env, dl->items[idx].ptr, dpl_npages(dl, idx));
  dpl_clear(dl);
}

145
src/dpl.h Normal file
View File

@ -0,0 +1,145 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* Sets the logical length of the dirty-page list and (re)installs the
 * terminating sentinel right after the last real item.  The sentinel's
 * pgno is P_INVALID (i.e. ~(pgno_t)0, the maximal page number, per the
 * assert below), which lets ascending scans stop on it without an
 * explicit bounds check.  Returns the new length. */
static inline size_t dpl_setlen(dpl_t *dl, size_t len) {
  /* shared immutable stub page used as the ".ptr" of the end-sentinel */
  static const page_t dpl_stub_pageE = {INVALID_TXNID,
                                        0,
                                        P_BAD,
                                        {0},
                                        /* pgno */ ~(pgno_t)0};
  assert(dpl_stub_pageE.flags == P_BAD && dpl_stub_pageE.pgno == P_INVALID);
  dl->length = len;
  dl->items[len + 1].ptr = (page_t *)&dpl_stub_pageE;
  dl->items[len + 1].pgno = P_INVALID;
  dl->items[len + 1].npages = 1;
  return len;
}
/* Empties the dirty-page list: length, sorted-watermark and the page
 * accounting go to zero, and both boundary sentinels are (re)installed —
 * slot 0 gets a stub with pgno 0, slot length+1 (via dpl_setlen) a stub
 * with pgno P_INVALID. */
static inline void dpl_clear(dpl_t *dl) {
  /* shared immutable stub page used as the ".ptr" of the begin-sentinel */
  static const page_t dpl_stub_pageB = {INVALID_TXNID,
                                        0,
                                        P_BAD,
                                        {0},
                                        /* pgno */ 0};
  assert(dpl_stub_pageB.flags == P_BAD && dpl_stub_pageB.pgno == 0);
  dl->sorted = dpl_setlen(dl, 0);
  dl->pages_including_loose = 0;
  dl->items[0].ptr = (page_t *)&dpl_stub_pageB;
  dl->items[0].pgno = 0;
  dl->items[0].npages = 1;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
MDBX_INTERNAL int __must_check_result dpl_alloc(MDBX_txn *txn);
MDBX_INTERNAL void dpl_free(MDBX_txn *txn);
MDBX_INTERNAL dpl_t *dpl_reserve(MDBX_txn *txn, size_t size);
MDBX_INTERNAL __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn);
/* Returns the txn's dirty-page list in sorted order: the fast path hands
 * the list straight back when it is already fully sorted (sorted ==
 * length), otherwise defers to the out-of-line dpl_sort_slowpath(). */
static inline dpl_t *dpl_sort(const MDBX_txn *txn) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  dpl_t *const list = txn->tw.dirtylist;
  tASSERT(txn, list->length <= PAGELIST_LIMIT);
  tASSERT(txn, list->sorted <= list->length);
  tASSERT(txn, list->items[0].pgno == 0 &&
                   list->items[list->length + 1].pgno == P_INVALID);
  if (likely(list->sorted == list->length))
    return list;
  return dpl_sort_slowpath(txn);
}
MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
MDBX_MAYBE_UNUSED MDBX_INTERNAL const page_t *
debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno);
/* Number of pages occupied by the i-th dirty-list entry: 1 for ordinary
 * pages, the stored count for large pages (cross-checked against the page
 * header in debug builds). */
MDBX_NOTHROW_PURE_FUNCTION static inline unsigned dpl_npages(const dpl_t *dl,
                                                             size_t i) {
  assert(0 <= (intptr_t)i && i <= dl->length);
  const unsigned npages = dl->items[i].npages;
  assert(npages ==
         (is_largepage(dl->items[i].ptr) ? dl->items[i].ptr->pages : 1));
  return npages;
}
/* First page number past the i-th dirty-list entry, i.e. the exclusive
 * upper bound of the page range it occupies. */
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t dpl_endpgno(const dpl_t *dl,
                                                            size_t i) {
  return dl->items[i].pgno + dpl_npages(dl, i);
}
/* Tells whether the page range [pgno, pgno+npages) overlaps any entry of
 * the (fully sorted) dirty-page list.  dpl_search() yields the first slot
 * n with items[n].pgno >= pgno (per the asserts below), so only slot n and
 * the preceding slot n-1 can possibly overlap; in debug builds the answer
 * is cross-verified against a full linear scan. */
static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno,
                                 size_t npages) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  dpl_t *dl = txn->tw.dirtylist;
  /* the list must be sorted and terminated by both sentinels */
  tASSERT(txn, dl->sorted == dl->length);
  tASSERT(txn, dl->items[0].pgno == 0 &&
                   dl->items[dl->length + 1].pgno == P_INVALID);
  size_t const n = dpl_search(txn, pgno);
  tASSERT(txn, n >= 1 && n <= dl->length + 1);
  tASSERT(txn, pgno <= dl->items[n].pgno);
  tASSERT(txn, pgno > dl->items[n - 1].pgno);
  const bool rc =
      /* intersection with founded */ pgno + npages > dl->items[n].pgno ||
      /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno;
  if (ASSERT_ENABLED()) {
    /* brute-force re-check: any entry whose range is neither entirely
     * before nor entirely after [pgno, pgno+npages) intersects it */
    bool check = false;
    for (size_t i = 1; i <= dl->length; ++i) {
      const page_t *const dp = dl->items[i].ptr;
      if (!(dp->pgno /* begin */ >= /* end */ pgno + npages ||
            dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno))
        check |= true;
    }
    tASSERT(txn, check == rc);
  }
  return rc;
}
/* Looks up pgno in the txn's dirty-page list; returns its 1-based slot
 * index when present, or 0 when the page is not in the list. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn,
                                                          pgno_t pgno) {
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  const dpl_t *const list = txn->tw.dirtylist;
  const size_t slot = dpl_search(txn, pgno);
  tASSERT(txn, (int)slot > 0);
  if (list->items[slot].pgno == pgno)
    return slot;
  return 0;
}
MDBX_INTERNAL void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages);
/* Removes the i-th entry from the dirty-page list, deriving the entry's
 * page count automatically via dpl_npages(). */
static inline void dpl_remove(const MDBX_txn *txn, size_t i) {
  const size_t npages = dpl_npages(txn->tw.dirtylist, i);
  dpl_remove_ex(txn, i, npages);
}
MDBX_INTERNAL int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno,
page_t *page, size_t npages);
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool dpl_check(MDBX_txn *txn);
/* Age of the i-th dirty page in LRU ticks: the difference between the
 * txn's current LRU clock (tw.dirtylru) and the size_t word stored
 * immediately before the shadow page buffer — presumably the LRU stamp
 * written when the page was last touched; the shadow-page freeing code
 * uses the same -sizeof(size_t) offset (NOTE(review): confirm against the
 * allocation site). */
MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t dpl_age(const MDBX_txn *txn,
                                                          size_t i) {
  tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
  const dpl_t *dl = txn->tw.dirtylist;
  assert((intptr_t)i > 0 && i <= dl->length);
  /* step back sizeof(size_t) bytes from the page to reach its stamp */
  size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
  return txn->tw.dirtylru - (uint32_t)*ptr;
}
MDBX_INTERNAL void dpl_lru_reduce(MDBX_txn *txn);
/* Advances the transaction's dirty-page LRU clock by one tick and returns
 * the resulting value.  When the counter grows past UINT32_MAX/3 and
 * shadow pages are in use (no MDBX_WRITEMAP), dpl_lru_reduce() is invoked
 * to rescale the stamps before wrap-around becomes possible. */
static inline uint32_t dpl_lru_turn(MDBX_txn *txn) {
  const uint32_t ticked = txn->tw.dirtylru + 1;
  txn->tw.dirtylru = ticked;
  if (unlikely(ticked > UINT32_MAX / 3) && (txn->flags & MDBX_WRITEMAP) == 0)
    dpl_lru_reduce(txn);
  /* re-read: dpl_lru_reduce() may have rescaled the clock */
  return txn->tw.dirtylru;
}
MDBX_INTERNAL void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled);
MDBX_INTERNAL void dpl_release_shadows(MDBX_txn *txn);

1553
src/dxb.c Normal file

File diff suppressed because it is too large Load Diff

419
src/env-opts.c Normal file
View File

@ -0,0 +1,419 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Computes the auto-tuned default for MDBX_opt_rp_augment_limit:
 * at least max(2 * maxgc_large1page, MDBX_PNL_INITIAL), growing toward
 * one third of the current database size (in pages), scaled by how much
 * of a 16-second budget (16.16 fixed-point) remains after the configured
 * gc_time_limit.  The result is normalized through the PNL
 * size->bytes->size round-trip. */
__cold static unsigned default_rp_augment_limit(const MDBX_env *env) {
  const size_t timeframe = /* 16 seconds */ 16 << 16;
  const size_t remain_1sec =
      (env->options.gc_time_limit < timeframe)
          ? timeframe - (size_t)env->options.gc_time_limit
          : 0;
  const size_t minimum = (env->maxgc_large1page * 2 > MDBX_PNL_INITIAL)
                             ? env->maxgc_large1page * 2
                             : MDBX_PNL_INITIAL;
  /* one third of the datafile, converted from bytes to pages */
  const size_t one_third = env->geo_in_bytes.now / 3 >> env->ps2ln;
  const size_t augment_limit =
      (one_third > minimum)
          ? minimum + (one_third - minimum) / timeframe * remain_1sec
          : minimum;
  eASSERT(env, augment_limit < PAGELIST_LIMIT);
  return pnl_bytes2size(pnl_size2bytes(augment_limit));
}
static bool default_prefault_write(const MDBX_env *env) {
return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->incore &&
(env->flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP;
}
/* Default for MDBX_opt_prefer_waf_insteadof_balance: currently always
 * off; the env argument is reserved for future heuristics. */
static bool default_prefer_waf_insteadof_balance(const MDBX_env *env) {
  (void)env;
  return false;
}
/* Fills env->options with build-time defaults.  Values that depend on the
 * actual geometry/page-size (e.g. rp_augment_limit, prefault_write) are
 * refined later by env_options_adjust_defaults(). */
void env_options_init(MDBX_env *env) {
  env->options.rp_augment_limit = MDBX_PNL_INITIAL;
  env->options.dp_reserve_limit = MDBX_PNL_INITIAL;
  env->options.dp_initial = MDBX_PNL_INITIAL;
  env->options.spill_max_denominator = 8;
  env->options.spill_min_denominator = 8;
  env->options.spill_parent4child_denominator = 0;
  env->options.dp_loose_limit = 64;
  env->options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */;
  if (default_prefer_waf_insteadof_balance(env))
    env->options.prefer_waf_insteadof_balance = true;
#if !(defined(_WIN32) || defined(_WIN64))
  /* On WSL1 the threshold is pushed to MAX_PAGENO, effectively never
   * triggering write-through — NOTE(review): presumably a WSL1 I/O
   * workaround, confirm against the WSL detection code. */
  env->options.writethrough_threshold =
#if defined(__linux__) || defined(__gnu_linux__)
      globals.running_on_WSL1 ? MAX_PAGENO :
#endif /* Linux */
                              MDBX_WRITETHROUGH_THRESHOLD_DEFAULT;
#endif /* Windows */
}
/* Recomputes the auto-tuned options that depend on the current geometry:
 * rp_augment_limit and prefault_write (unless pinned by the user via the
 * non_auto flags), plus the madvise threshold derived from the current
 * datafile size. */
void env_options_adjust_defaults(MDBX_env *env) {
  if (!env->options.flags.non_auto.rp_augment_limit)
    env->options.rp_augment_limit = default_rp_augment_limit(env);
  if (!env->options.flags.non_auto.prefault_write)
    env->options.prefault_write = default_prefault_write(env);

  /* TODO: use options? */
  const unsigned factor = 9;
  const size_t basis = env->geo_in_bytes.now;
  /* threshold = basis / 2^factor, clamped into [64K, 4M] */
  size_t threshold;
  if (basis < ((size_t)65536 << factor))
    threshold = 65536 /* minimal threshold */;
  else if (basis > (MEGABYTE * 4 << factor))
    threshold = MEGABYTE * 4 /* maximal threshold */;
  else
    threshold = basis >> factor;
  /* never exceed the configured shrink step, when one is set */
  if (env->geo_in_bytes.shrink && threshold >= env->geo_in_bytes.shrink)
    threshold = env->geo_in_bytes.shrink;
  env->madv_threshold = bytes2pgno(env, bytes_align2os_bytes(env, threshold));
}
//------------------------------------------------------------------------------
/* Sets a runtime option (MDBX_option_t) of the environment.
 * Passing UINT64_MAX as `value` selects the option's default.  Options
 * touching shared state may briefly acquire the write-transaction lock
 * when called from a thread that does not own the basal transaction.
 * Returns MDBX_SUCCESS, or MDBX_EINVAL for an out-of-range value or
 * unknown option, MDBX_EACCESS for a read-only env, MDBX_EPERM when the
 * env state forbids the change. */
__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
                               uint64_t value) {
  int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* the lock is needed only when the env is active, has a basal txn, and
   * the calling thread does not already own it */
  const bool lock_needed =
      ((env->flags & ENV_ACTIVE) && env->basal_txn && !env_txn0_owned(env));
  bool should_unlock = false;
  switch (option) {
  case MDBX_opt_sync_bytes:
    if (value == /* default */ UINT64_MAX)
      value = MAX_WRITE;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    if (unlikely(value > SIZE_MAX - 65536))
      return MDBX_EINVAL;
    value = bytes2pgno(env, (size_t)value + env->ps - 1);
    if ((uint32_t)value !=
            atomic_load32(&env->lck->autosync_threshold, mo_AcquireRelease) &&
        atomic_store32(&env->lck->autosync_threshold, (uint32_t)value,
                       mo_Relaxed)
        /* Trigger sync(force=off) only when the new value is non-zero
         * and we are outside a transaction */
        && lock_needed) {
      err = env_sync(env, false, false);
      if (err == /* nothing to flush to disk */ MDBX_RESULT_TRUE)
        err = MDBX_SUCCESS;
    }
    break;

  case MDBX_opt_sync_period:
    if (value == /* default */ UINT64_MAX)
      value = 2780315 /* 42.42424 seconds */;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    if (unlikely(value > UINT32_MAX))
      return MDBX_EINVAL;
    value = osal_16dot16_to_monotime((uint32_t)value);
    if (value != atomic_load64(&env->lck->autosync_period, mo_AcquireRelease) &&
        atomic_store64(&env->lck->autosync_period, value, mo_Relaxed)
        /* Trigger sync(force=off) only when the new value is non-zero
         * and we are outside a transaction */
        && lock_needed) {
      err = env_sync(env, false, false);
      if (err == /* nothing to flush to disk */ MDBX_RESULT_TRUE)
        err = MDBX_SUCCESS;
    }
    break;

  case MDBX_opt_max_db:
    if (value == /* default */ UINT64_MAX)
      value = 42;
    if (unlikely(value > MDBX_MAX_DBI))
      return MDBX_EINVAL;
    /* can only be changed before the datafile is mapped */
    if (unlikely(env->dxb_mmap.base))
      return MDBX_EPERM;
    env->max_dbi = (unsigned)value + CORE_DBS;
    break;

  case MDBX_opt_max_readers:
    if (value == /* default */ UINT64_MAX)
      value = MDBX_READERS_LIMIT;
    if (unlikely(value < 1 || value > MDBX_READERS_LIMIT))
      return MDBX_EINVAL;
    /* can only be changed before the datafile is mapped */
    if (unlikely(env->dxb_mmap.base))
      return MDBX_EPERM;
    env->max_readers = (unsigned)value;
    break;

  case MDBX_opt_dp_reserve_limit:
    if (value == /* default */ UINT64_MAX)
      value = INT_MAX;
    if (unlikely(value > INT_MAX))
      return MDBX_EINVAL;
    if (env->options.dp_reserve_limit != (unsigned)value) {
      if (lock_needed) {
        err = lck_txn_lock(env, false);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
        should_unlock = true;
      }
      env->options.dp_reserve_limit = (unsigned)value;
      /* trim the shadow-page reserve list down to the new limit */
      while (env->shadow_reserve_len > env->options.dp_reserve_limit) {
        eASSERT(env, env->shadow_reserve != nullptr);
        page_t *dp = env->shadow_reserve;
        MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->ps);
        VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *));
        env->shadow_reserve = page_next(dp);
        /* the buffer was allocated with a size_t slot prepended */
        void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
        osal_free(ptr);
        env->shadow_reserve_len -= 1;
      }
    }
    break;

  case MDBX_opt_rp_augment_limit:
    if (value == /* default */ UINT64_MAX) {
      /* default: back to auto-tuning */
      env->options.flags.non_auto.rp_augment_limit = 0;
      env->options.rp_augment_limit = default_rp_augment_limit(env);
    } else if (unlikely(value > PAGELIST_LIMIT))
      return MDBX_EINVAL;
    else {
      env->options.flags.non_auto.rp_augment_limit = 1;
      env->options.rp_augment_limit = (unsigned)value;
    }
    break;

  case MDBX_opt_gc_time_limit:
    if (value == /* default */ UINT64_MAX)
      value = 0;
    if (unlikely(value > UINT32_MAX))
      return MDBX_EINVAL;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    value = osal_16dot16_to_monotime((uint32_t)value);
    if (value != env->options.gc_time_limit) {
      if (env->txn && lock_needed)
        return MDBX_EPERM;
      env->options.gc_time_limit = value;
      /* the auto-tuned rp_augment_limit depends on gc_time_limit */
      if (!env->options.flags.non_auto.rp_augment_limit)
        env->options.rp_augment_limit = default_rp_augment_limit(env);
    }
    break;

  case MDBX_opt_txn_dp_limit:
  case MDBX_opt_txn_dp_initial:
    if (value == /* default */ UINT64_MAX)
      value = PAGELIST_LIMIT;
    if (unlikely(value > PAGELIST_LIMIT || value < CURSOR_STACK_SIZE * 4))
      return MDBX_EINVAL;
    if (unlikely(env->flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (lock_needed) {
      err = lck_txn_lock(env, false);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      should_unlock = true;
    }
    if (env->txn)
      err = MDBX_EPERM /* unable change during transaction */;
    else {
      /* keep dp_initial <= dp_limit invariant in both directions */
      const pgno_t value32 = (pgno_t)value;
      if (option == MDBX_opt_txn_dp_initial &&
          env->options.dp_initial != value32) {
        env->options.dp_initial = value32;
        if (env->options.dp_limit < value32) {
          env->options.dp_limit = value32;
          env->options.flags.non_auto.dp_limit = 1;
        }
      }
      if (option == MDBX_opt_txn_dp_limit && env->options.dp_limit != value32) {
        env->options.dp_limit = value32;
        env->options.flags.non_auto.dp_limit = 1;
        if (env->options.dp_initial > value32)
          env->options.dp_initial = value32;
      }
    }
    break;

  case MDBX_opt_spill_max_denominator:
    if (value == /* default */ UINT64_MAX)
      value = 8;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.spill_max_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_min_denominator:
    if (value == /* default */ UINT64_MAX)
      value = 8;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.spill_min_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_parent4child_denominator:
    if (value == /* default */ UINT64_MAX)
      value = 0;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.spill_parent4child_denominator = (uint8_t)value;
    break;

  case MDBX_opt_loose_limit:
    if (value == /* default */ UINT64_MAX)
      value = 64;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->options.dp_loose_limit = (uint8_t)value;
    break;

  case MDBX_opt_merge_threshold_16dot16_percent:
    if (value == /* default */ UINT64_MAX)
      value = 65536 / 4 /* 25% */;
    /* accepted range is 12.5%..50% in 16.16 fixed-point */
    if (unlikely(value < 8192 || value > 32768))
      return MDBX_EINVAL;
    env->options.merge_threshold_16dot16_percent = (unsigned)value;
    recalculate_merge_thresholds(env);
    break;

  case MDBX_opt_writethrough_threshold:
#if defined(_WIN32) || defined(_WIN64)
    /* Accept only the default value, or the value matching the behavior
     * implied by the current MDBX_NOMETASYNC setting (the threshold itself
     * is not tunable on Windows).
     * FIXED: the first comparison was `value == UINT64_MAX`, which made
     * the default always rejected and every other value silently accepted
     * — the inverse of the intent stated above. */
    if (value != /* default */ UINT64_MAX &&
        value != ((env->flags & MDBX_NOMETASYNC) ? 0 : UINT_MAX))
      err = MDBX_EINVAL;
#else
    if (value == /* default */ UINT64_MAX)
      value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT;
    /* reject values that do not fit into `unsigned` */
    if (value != (unsigned)value)
      err = MDBX_EINVAL;
    else
      env->options.writethrough_threshold = (unsigned)value;
#endif
    break;

  case MDBX_opt_prefault_write_enable:
    if (value == /* default */ UINT64_MAX) {
      /* default: back to auto-tuning */
      env->options.prefault_write = default_prefault_write(env);
      env->options.flags.non_auto.prefault_write = false;
    } else if (value > 1)
      err = MDBX_EINVAL;
    else {
      env->options.prefault_write = value != 0;
      env->options.flags.non_auto.prefault_write = true;
    }
    break;

  case MDBX_opt_prefer_waf_insteadof_balance:
    if (value == /* default */ UINT64_MAX)
      env->options.prefer_waf_insteadof_balance =
          default_prefer_waf_insteadof_balance(env);
    else if (value > 1)
      err = MDBX_EINVAL;
    else
      env->options.prefer_waf_insteadof_balance = value != 0;
    break;

  default:
    return MDBX_EINVAL;
  }

  if (should_unlock)
    lck_txn_unlock(env);
  return err;
}
/* Reads the current value of a runtime option into *pvalue.
 * Counterpart of mdbx_env_set_option().  Returns MDBX_EINVAL for an
 * unknown option or NULL pvalue, MDBX_EPERM when the option requires an
 * active environment, MDBX_SUCCESS otherwise. */
__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
                               uint64_t *pvalue) {
  int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(!pvalue))
    return MDBX_EINVAL;
  switch (option) {
  case MDBX_opt_sync_bytes:
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    /* stored in pages, reported in bytes */
    *pvalue = pgno2bytes(
        env, atomic_load32(&env->lck->autosync_threshold, mo_Relaxed));
    break;
  case MDBX_opt_sync_period:
    if (unlikely(!(env->flags & ENV_ACTIVE)))
      return MDBX_EPERM;
    /* stored in monotime units, reported in 16.16 seconds */
    *pvalue = osal_monotime_to_16dot16(
        atomic_load64(&env->lck->autosync_period, mo_Relaxed));
    break;
  case MDBX_opt_max_db:
    *pvalue = env->max_dbi - CORE_DBS;
    break;
  case MDBX_opt_max_readers:
    *pvalue = env->max_readers;
    break;
  case MDBX_opt_dp_reserve_limit:
    *pvalue = env->options.dp_reserve_limit;
    break;
  case MDBX_opt_rp_augment_limit:
    *pvalue = env->options.rp_augment_limit;
    break;
  case MDBX_opt_gc_time_limit:
    *pvalue = osal_monotime_to_16dot16(env->options.gc_time_limit);
    break;
  case MDBX_opt_txn_dp_limit:
    *pvalue = env->options.dp_limit;
    break;
  case MDBX_opt_txn_dp_initial:
    *pvalue = env->options.dp_initial;
    break;
  case MDBX_opt_spill_max_denominator:
    *pvalue = env->options.spill_max_denominator;
    break;
  case MDBX_opt_spill_min_denominator:
    *pvalue = env->options.spill_min_denominator;
    break;
  case MDBX_opt_spill_parent4child_denominator:
    *pvalue = env->options.spill_parent4child_denominator;
    break;
  case MDBX_opt_loose_limit:
    *pvalue = env->options.dp_loose_limit;
    break;
  case MDBX_opt_merge_threshold_16dot16_percent:
    *pvalue = env->options.merge_threshold_16dot16_percent;
    break;
  case MDBX_opt_writethrough_threshold:
#if defined(_WIN32) || defined(_WIN64)
    /* on Windows the threshold is fully determined by MDBX_NOMETASYNC */
    *pvalue = (env->flags & MDBX_NOMETASYNC) ? 0 : INT_MAX;
#else
    *pvalue = env->options.writethrough_threshold;
#endif
    break;
  case MDBX_opt_prefault_write_enable:
    *pvalue = env->options.prefault_write;
    break;
  case MDBX_opt_prefer_waf_insteadof_balance:
    *pvalue = env->options.prefer_waf_insteadof_balance;
    break;
  default:
    return MDBX_EINVAL;
  }
  return MDBX_SUCCESS;
}

679
src/env.c Normal file
View File

@ -0,0 +1,679 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
bool env_txn0_owned(const MDBX_env *env) {
return (env->flags & MDBX_NOSTICKYTHREADS)
? (env->basal_txn->owner != 0)
: (env->basal_txn->owner == osal_thread_self());
}
/* Lazily allocates the page-aligned auxiliary buffer of NUM_METAS
 * database pages; a no-op returning MDBX_SUCCESS when it already exists. */
int env_page_auxbuffer(MDBX_env *env) {
  if (env->page_auxbuf)
    return MDBX_SUCCESS;
  const size_t bytes = env->ps * (size_t)NUM_METAS;
  return osal_memalign_alloc(globals.sys_pagesize, bytes, &env->page_auxbuf);
}
/* Configures the environment for the given database page size (a power of
 * two within [MDBX_MIN_PAGESIZE, MDBX_MAX_PAGESIZE]): recomputes all the
 * page-size-derived limits (GC capacity, node-size maxima, sub-page
 * thresholds) and auto-tunes dp_limit from available RAM unless it was
 * set explicitly.  Returns the accepted page size. */
__cold unsigned env_setup_pagesize(MDBX_env *env, const size_t pagesize) {
  STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE);
  STATIC_ASSERT(MDBX_MIN_PAGESIZE > sizeof(page_t) + sizeof(meta_t));
  ENSURE(env, is_powerof2(pagesize));
  ENSURE(env, pagesize >= MDBX_MIN_PAGESIZE);
  ENSURE(env, pagesize <= MDBX_MAX_PAGESIZE);
  env->ps = (unsigned)pagesize;
  /* the aux buffer is sized in DB pages — drop it so it is re-allocated
   * for the new page size on next use */
  if (env->page_auxbuf) {
    osal_memalign_free(env->page_auxbuf);
    env->page_auxbuf = nullptr;
  }
  STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MIN_PAGESIZE) > 4);
  STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MAX_PAGESIZE) < PAGELIST_LIMIT);
  /* how many pgno_t entries fit into one large/overflow GC page */
  const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
  ENSURE(env,
         maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)PAGELIST_LIMIT / 4);
  env->maxgc_large1page = (unsigned)maxgc_ov1page;
  env->maxgc_per_branch =
      (unsigned)((pagesize - PAGEHDRSZ) /
                 (sizeof(indx_t) + sizeof(node_t) + sizeof(txnid_t)));
  STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) >
                sizeof(tree_t) + NODESIZE + 42);
  STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX);
  STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) >=
                BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE));
  STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) > NODESIZE + 42);
  STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX);
  const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize);
  const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize);
  ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) &&
                  branch_nodemax % 2 == 0 &&
                  leaf_nodemax > (intptr_t)(sizeof(tree_t) + NODESIZE + 42) &&
                  leaf_nodemax >= branch_nodemax &&
                  leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0);
  env->leaf_nodemax = (uint16_t)leaf_nodemax;
  env->branch_nodemax = (uint16_t)branch_nodemax;
  env->ps2ln = (uint8_t)log2n_powerof2(pagesize);
  eASSERT(env, pgno2bytes(env, 1) == pagesize);
  eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2);
  recalculate_merge_thresholds(env);
  /* TODO: recalculate me_subpage_xyz values from MDBX_opt_subpage_xyz. */
  env->subpage_limit = env->leaf_nodemax - NODESIZE;
  env->subpage_room_threshold = 0;
  env->subpage_reserve_prereq = env->leaf_nodemax;
  env->subpage_reserve_limit = env->subpage_limit / 42;
  eASSERT(env, env->subpage_reserve_prereq >
                   env->subpage_room_threshold + env->subpage_reserve_limit);
  eASSERT(env, env->leaf_nodemax >= env->subpage_limit + NODESIZE);
  const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE);
  if (!env->options.flags.non_auto.dp_limit) {
    /* auto-setup dp_limit by "The42" ;-) */
    intptr_t total_ram_pages, avail_ram_pages;
    int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages);
    if (unlikely(err != MDBX_SUCCESS))
      ERROR("mdbx_get_sysraminfo(), rc %d", err);
    else {
      size_t reasonable_dpl_limit =
          (size_t)(total_ram_pages + avail_ram_pages) / 42;
      /* convert from system RAM pages to DB pages */
      if (pagesize > globals.sys_pagesize)
        reasonable_dpl_limit /= pagesize / globals.sys_pagesize;
      else if (pagesize < globals.sys_pagesize)
        reasonable_dpl_limit *= globals.sys_pagesize / pagesize;
      /* clamp into [CURSOR_STACK_SIZE * 4, PAGELIST_LIMIT] */
      reasonable_dpl_limit = (reasonable_dpl_limit < PAGELIST_LIMIT)
                                 ? reasonable_dpl_limit
                                 : PAGELIST_LIMIT;
      reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK_SIZE * 4)
                                 ? reasonable_dpl_limit
                                 : CURSOR_STACK_SIZE * 4;
      env->options.dp_limit = (unsigned)reasonable_dpl_limit;
    }
  }
  if (env->options.dp_limit > max_pgno - NUM_METAS)
    env->options.dp_limit = max_pgno - NUM_METAS;
  if (env->options.dp_initial > env->options.dp_limit)
    env->options.dp_initial = env->options.dp_limit;
  return env->ps;
}
/* Synchronizes the environment to durable storage.
 * force    — request a full steady sync regardless of the autosync
 *            threshold/period settings;
 * nonblock — when the write-txn lock must be taken, fail rather than wait.
 * Returns MDBX_SUCCESS when some data was synced, MDBX_RESULT_TRUE when
 * there was nothing to sync, or an error code. */
__cold int env_sync(MDBX_env *env, bool force, bool nonblock) {
  if (unlikely(env->flags & MDBX_RDONLY))
    return MDBX_EACCESS;

  const bool txn0_owned = env_txn0_owned(env);
  bool should_unlock = false;
  int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */;

retry:;
  unsigned flags = env->flags & ~(MDBX_NOMETASYNC | txn_shrink_allowed);
  if (unlikely((flags & (ENV_FATAL_ERROR | ENV_ACTIVE)) != ENV_ACTIVE)) {
    rc = (flags & ENV_FATAL_ERROR) ? MDBX_PANIC : MDBX_EPERM;
    goto bailout;
  }

  /* use the cached troika when we hold the write lock, otherwise sample
   * the meta-pages afresh */
  const troika_t troika =
      (txn0_owned | should_unlock) ? env->basal_txn->tw.troika : meta_tap(env);
  const meta_ptr_t head = meta_recent(env, &troika);
  const uint64_t unsynced_pages =
      atomic_load64(&env->lck->unsynced_pages, mo_Relaxed);
  if (unsynced_pages == 0) {
    /* nothing dirty; done if the recent meta is steady and already synced */
    const uint32_t synched_meta_txnid_u32 =
        atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
    if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady)
      goto bailout;
  }

  if (should_unlock && (env->flags & MDBX_WRITEMAP) &&
      unlikely(head.ptr_c->geometry.first_unallocated >
               bytes2pgno(env, env->dxb_mmap.current))) {
    /* the mapping is smaller than the used space — grow it first */
    if (unlikely(env->stuck_meta >= 0) &&
        troika.recent != (uint8_t)env->stuck_meta) {
      NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent "
             "meta-page (%u)",
             "sync datafile", env->stuck_meta, troika.recent);
      rc = MDBX_RESULT_TRUE;
    } else {
      rc = dxb_resize(env, head.ptr_c->geometry.first_unallocated,
                      head.ptr_c->geometry.now, head.ptr_c->geometry.upper,
                      implicit_grow);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
    }
  }

  const size_t autosync_threshold =
      atomic_load32(&env->lck->autosync_threshold, mo_Relaxed);
  const uint64_t autosync_period =
      atomic_load64(&env->lck->autosync_period, mo_Relaxed);
  uint64_t eoos_timestamp;
  /* escalate to a full steady sync when forced, or when the autosync
   * threshold/period has been exceeded */
  if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
      (autosync_period &&
       (eoos_timestamp =
            atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) &&
       osal_monotime() - eoos_timestamp >= autosync_period))
    flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;

  if (!txn0_owned) {
    if (!should_unlock) {
#if MDBX_ENABLE_PGOP_STAT
      unsigned wops = 0;
#endif /* MDBX_ENABLE_PGOP_STAT */

      int err;
      /* pre-sync to avoid latency for writer */
      if (unsynced_pages > /* FIXME: define threshold */ 42 &&
          (flags & MDBX_SAFE_NOSYNC) == 0) {
        eASSERT(env, ((flags ^ env->flags) & MDBX_WRITEMAP) == 0);
        if (flags & MDBX_WRITEMAP) {
          /* Acquire guard to avoid collision with remap */
#if defined(_WIN32) || defined(_WIN64)
          imports.srwl_AcquireShared(&env->remap_guard);
#else
          err = osal_fastmutex_acquire(&env->remap_guard);
          if (unlikely(err != MDBX_SUCCESS))
            return err;
#endif
          const size_t usedbytes =
              pgno_align2os_bytes(env, head.ptr_c->geometry.first_unallocated);
          err = osal_msync(&env->dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA);
#if defined(_WIN32) || defined(_WIN64)
          imports.srwl_ReleaseShared(&env->remap_guard);
#else
          int unlock_err = osal_fastmutex_release(&env->remap_guard);
          if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS)
            err = unlock_err;
#endif
        } else
          err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA);

        if (unlikely(err != MDBX_SUCCESS))
          return err;

#if MDBX_ENABLE_PGOP_STAT
        wops = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
        /* pre-sync done */
        rc = MDBX_SUCCESS /* means "some data was synced" */;
      }

      err = lck_txn_lock(env, nonblock);
      if (unlikely(err != MDBX_SUCCESS))
        return err;

      should_unlock = true;
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.wops.weak += wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
      env->basal_txn->tw.troika = meta_tap(env);
      eASSERT(env, !env->txn && !env->basal_txn->nested);
      /* re-evaluate the situation now that we hold the write lock */
      goto retry;
    }
    eASSERT(env, head.txnid == recent_committed_txnid(env));
    env->basal_txn->txnid = head.txnid;
    txn_snapshot_oldest(env->basal_txn);
    flags |= txn_shrink_allowed;
  }

  eASSERT(env, txn0_owned || should_unlock);
  eASSERT(env, !txn0_owned || (flags & txn_shrink_allowed) == 0);

  if (!head.is_steady && unlikely(env->stuck_meta >= 0) &&
      troika.recent != (uint8_t)env->stuck_meta) {
    NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent "
           "meta-page (%u)",
           "sync datafile", env->stuck_meta, troika.recent);
    rc = MDBX_RESULT_TRUE;
    goto bailout;
  }
  if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
    DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64,
          data_page(head.ptr_c)->pgno, durable_caption(head.ptr_c),
          unsynced_pages);
    meta_t meta = *head.ptr_c;
    rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->tw.troika);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
  }

  /* LY: sync meta-pages if MDBX_NOMETASYNC enabled
   * and someone was not synced above. */
  if (atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) !=
      (uint32_t)head.txnid)
    rc = meta_sync(env, head);

bailout:
  if (should_unlock)
    lck_txn_unlock(env);
  return rc;
}
__cold int env_open(MDBX_env *env, mdbx_mode_t mode) {
/* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH:
*
* 0) Если размер страниц БД меньше системной страницы ОЗУ, то ядру ОС
* придется чаще обновлять страницы в unified page cache.
*
* Однако, O_DSYNC не предполагает отключение unified page cache,
* поэтому подобные затруднения будем считать проблемой ОС и/или
* ожидаемым пенальти из-за использования мелких страниц БД.
*
* 1) В режиме MDBX_SYNC_DURABLE - O_DSYNC для записи как данных,
* так и мета-страниц. Однако, на Linux отказ от O_DSYNC с последующим
* fdatasync() может быть выгоднее при использовании HDD, так как
* позволяет io-scheduler переупорядочить запись с учетом актуального
* расположения файла БД на носителе.
*
* 2) В режиме MDBX_NOMETASYNC - O_DSYNC можно использовать для данных,
* но в этом может не быть смысла, так как fdatasync() всё равно
* требуется для гарантии фиксации мета после предыдущей транзакции.
*
* В итоге на нормальных системах (не Windows) есть два варианта:
* - при возможности O_DIRECT и/или io_ring для данных, скорее всего,
* есть смысл вызвать fdatasync() перед записью данных, а затем
* использовать O_DSYNC;
* - не использовать O_DSYNC и вызывать fdatasync() после записи данных.
*
* На Windows же следует минимизировать использование FlushFileBuffers()
* из-за проблем с производительностью. Поэтому на Windows в режиме
* MDBX_NOMETASYNC:
* - мета обновляется через дескриптор без FILE_FLAG_WRITE_THROUGH;
* - перед началом записи данных вызывается FlushFileBuffers(), если
* meta_sync_txnid не совпадает с последней записанной мета;
* - данные записываются через дескриптор с FILE_FLAG_WRITE_THROUGH.
*
* 3) В режиме MDBX_SAFE_NOSYNC - O_DSYNC нет смысла использовать, пока не
* будет реализована возможность полностью асинхронной "догоняющей"
* записи в выделенном процессе-сервере с io-ring очередями внутри.
*
* -----
*
* Использование O_DIRECT или FILE_FLAG_NO_BUFFERING:
*
* Назначение этих флагов в отключении файлового дескриптора от
* unified page cache, т.е. от отображенных в память данных в случае
* libmdbx.
*
* Поэтому, использование direct i/o в libmdbx без MDBX_WRITEMAP лишено
* смысла и контр-продуктивно, ибо так мы провоцируем ядро ОС на
* не-когерентность отображения в память с содержимым файла на носителе,
* либо требуем дополнительных проверок и действий направленных на
* фактическое отключение O_DIRECT для отображенных в память данных.
*
* В режиме MDBX_WRITEMAP когерентность отображенных данных обеспечивается
* физически. Поэтому использование direct i/o может иметь смысл, если у
* ядра ОС есть какие-то проблемы с msync(), в том числе с
* производительностью:
* - использование io_ring или gather-write может быть дешевле, чем
* просмотр PTE ядром и запись измененных/грязных;
* - но проблема в том, что записываемые из user mode страницы либо не
* будут помечены чистыми (и соответственно будут записаны ядром
* еще раз), либо ядру необходимо искать и чистить PTE при получении
* запроса на запись.
*
* Поэтому O_DIRECT или FILE_FLAG_NO_BUFFERING используется:
* - только в режиме MDBX_SYNC_DURABLE с MDBX_WRITEMAP;
* - когда ps >= me_os_psize;
* - опция сборки MDBX_AVOID_MSYNC != 0, которая по-умолчанию включена
* только на Windows (см ниже).
*
* -----
*
* Использование FILE_FLAG_OVERLAPPED на Windows:
*
* У Windows очень плохо с I/O (за исключением прямых постраничных
* scatter/gather, которые работают в обход проблемного unified page
* cache и поэтому почти бесполезны в libmdbx).
*
* При этом всё еще хуже при использовании FlushFileBuffers(), что также
* требуется после FlushViewOfFile() в режиме MDBX_WRITEMAP. Поэтому
* на Windows вместо FlushViewOfFile() и FlushFileBuffers() следует
* использовать запись через дескриптор с FILE_FLAG_WRITE_THROUGH.
*
* В свою очередь, запись с FILE_FLAG_WRITE_THROUGH дешевле/быстрее
* при использовании FILE_FLAG_OVERLAPPED. В результате, на Windows
* в durable-режимах запись данных всегда в overlapped-режиме,
* при этом для записи мета требуется отдельный не-overlapped дескриптор.
*/
env->pid = osal_getpid();
int rc = osal_openfile((env->flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ
: MDBX_OPEN_DXB_LAZY,
env, env->pathname.dxb, &env->lazy_fd, mode);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
env->me_sysv_ipc.key = ftok(env->pathname.dxb, 42);
if (unlikely(env->me_sysv_ipc.key == -1))
return errno;
#endif /* MDBX_LOCKING */
/* Set the position in files outside of the data to avoid corruption
* due to erroneous use of file descriptors in the application code. */
const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000);
osal_fseek(env->lazy_fd, safe_parking_lot_offset);
env->fd4meta = env->lazy_fd;
#if defined(_WIN32) || defined(_WIN64)
eASSERT(env, env->ioring.overlapped_fd == 0);
bool ior_direct = false;
if (!(env->flags &
(MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) {
if (MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) {
/* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции
* MDBX_AVOID_MSYNC.
*
* 1) В этой комбинации наиболее выгодно использовать WriteFileGather(),
* но для этого необходимо открыть файл с флагом FILE_FLAG_NO_BUFFERING и
* после обеспечивать выравнивание адресов и размера данных на границу
* системной страницы, что в свою очередь возможно если размер страницы БД
* не меньше размера системной страницы ОЗУ. Поэтому для открытия файла в
* нужном режиме требуется знать размер страницы БД.
*
* 2) Кроме этого, в Windows запись в заблокированный регион файла
* возможно только через тот-же дескриптор. Поэтому изначальный захват
* блокировок посредством lck_seize(), захват/освобождение блокировок
* во время пишущих транзакций и запись данных должны выполнятся через
* один дескриптор.
*
* Таким образом, требуется прочитать волатильный заголовок БД, чтобы
* узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном
* для записи данных, чтобы использовать именно этот дескриптор для
* изначального захвата блокировок. */
meta_t header;
uint64_t dxb_filesize;
int err = dxb_read_header(env, &header, MDBX_SUCCESS, true);
if ((err == MDBX_SUCCESS && header.pagesize >= globals.sys_pagesize) ||
(err == MDBX_ENODATA && mode && env->ps >= globals.sys_pagesize &&
osal_filesize(env->lazy_fd, &dxb_filesize) == MDBX_SUCCESS &&
dxb_filesize == 0))
/* Может быть коллизия, если два процесса пытаются одновременно создать
* БД с разным размером страницы, который у одного меньше системной
* страницы, а у другого НЕ меньше. Эта допустимая, но очень странная
* ситуация. Поэтому считаем её ошибочной и не пытаемся разрешить. */
ior_direct = true;
}
rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT
: MDBX_OPEN_DXB_OVERLAPPED,
env, env->pathname.dxb, &env->ioring.overlapped_fd, 0);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
env->dxb_lock_event = CreateEventW(nullptr, true, false, nullptr);
if (unlikely(!env->dxb_lock_event))
return (int)GetLastError();
osal_fseek(env->ioring.overlapped_fd, safe_parking_lot_offset);
}
#else
if (mode == 0) {
/* pickup mode for lck-file */
struct stat st;
if (unlikely(fstat(env->lazy_fd, &st)))
return errno;
mode = st.st_mode;
}
mode = (/* inherit read permissions for group and others */ mode &
(S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
/* always add read/write for owner */ S_IRUSR | S_IWUSR |
((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) |
((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0);
#endif /* !Windows */
const int lck_rc = lck_setup(env, mode);
if (unlikely(MDBX_IS_ERROR(lck_rc)))
return lck_rc;
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE)
osal_fseek(env->lck_mmap.fd, safe_parking_lot_offset);
eASSERT(env, env->dsync_fd == INVALID_HANDLE_VALUE);
if (!(env->flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | DEPRECATED_MAPASYNC
#if defined(_WIN32) || defined(_WIN64)
| MDBX_EXCLUSIVE
#endif /* !Windows */
))) {
rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env->pathname.dxb,
&env->dsync_fd, 0);
if (unlikely(MDBX_IS_ERROR(rc)))
return rc;
if (env->dsync_fd != INVALID_HANDLE_VALUE) {
if ((env->flags & MDBX_NOMETASYNC) == 0)
env->fd4meta = env->dsync_fd;
osal_fseek(env->dsync_fd, safe_parking_lot_offset);
}
}
const MDBX_env_flags_t lazy_flags =
MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC;
const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM |
MDBX_NORDAHEAD | MDBX_RDONLY |
MDBX_WRITEMAP;
lck_t *const lck = env->lck_mmap.lck;
if (lck && lck_rc != MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) {
MDBX_env_flags_t snap_flags;
while ((snap_flags = atomic_load32(&lck->envmode, mo_AcquireRelease)) ==
MDBX_RDONLY) {
if (atomic_cas32(&lck->envmode, MDBX_RDONLY,
(snap_flags = (env->flags & mode_flags)))) {
/* The case:
* - let's assume that for some reason the DB file is smaller
* than it should be according to the geometry,
* but not smaller than the last page used;
* - the first process that opens the database (lck_rc == RESULT_TRUE)
* does this in readonly mode and therefore cannot bring
* the file size back to normal;
* - some next process (lck_rc != RESULT_TRUE) opens the DB in
* read-write mode and now is here.
*
* FIXME: Should we re-check and set the size of DB-file right here? */
break;
}
atomic_yield();
}
if (env->flags & MDBX_ACCEDE) {
/* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */
const MDBX_env_flags_t diff =
(snap_flags ^ env->flags) &
((snap_flags & lazy_flags) ? mode_flags
: mode_flags & ~MDBX_WRITEMAP);
env->flags ^= diff;
NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->flags ^ diff,
env->flags);
}
/* Ранее упущенный не очевидный момент: При работе БД в режимах
* не-синхронной/отложенной фиксации на диске, все процессы-писатели должны
* иметь одинаковый режим MDBX_WRITEMAP.
*
* В противном случае, сброс на диск следует выполнять дважды: сначала
* msync(), затем fdatasync(). При этом msync() не обязан отрабатывать
* в процессах без MDBX_WRITEMAP, так как файл в память отображен только
* для чтения. Поэтому, в общем случае, различия по MDBX_WRITEMAP не
* позволяют выполнить фиксацию данных на диск, после их изменения в другом
* процессе.
*
* В режиме MDBX_UTTERLY_NOSYNC позволять совместную работу с MDBX_WRITEMAP
* также не следует, поскольку никакой процесс (в том числе последний) не
* может гарантированно сбросить данные на диск, а следовательно не должен
* помечать какую-либо транзакцию как steady.
*
* В результате, требуется либо запретить совместную работу процессам с
* разным MDBX_WRITEMAP в режиме отложенной записи, либо отслеживать такое
* смешивание и блокировать steady-пометки - что контрпродуктивно. */
const MDBX_env_flags_t rigorous_flags =
(snap_flags & lazy_flags)
? MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP
: MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC;
const MDBX_env_flags_t rigorous_diff =
(snap_flags ^ env->flags) & rigorous_flags;
if (rigorous_diff) {
ERROR("current mode/flags 0x%X incompatible with requested 0x%X, "
"rigorous diff 0x%X",
env->flags, snap_flags, rigorous_diff);
return MDBX_INCOMPATIBLE;
}
}
mincore_clean_cache(env);
const int dxb_rc = dxb_setup(env, lck_rc, mode);
if (MDBX_IS_ERROR(dxb_rc))
return dxb_rc;
rc = osal_check_fs_incore(env->lazy_fd);
env->incore = false;
if (rc == MDBX_RESULT_TRUE) {
env->incore = true;
NOTICE("%s", "in-core database");
rc = MDBX_SUCCESS;
} else if (unlikely(rc != MDBX_SUCCESS)) {
ERROR("check_fs_incore(), err %d", rc);
return rc;
}
if (unlikely(/* recovery mode */ env->stuck_meta >= 0) &&
(lck_rc != /* exclusive */ MDBX_RESULT_TRUE ||
(env->flags & MDBX_EXCLUSIVE) == 0)) {
ERROR("%s", "recovery requires exclusive mode");
return MDBX_BUSY;
}
DEBUG("opened dbenv %p", (void *)env);
env->flags |= ENV_ACTIVE;
if (!lck || lck_rc == MDBX_RESULT_TRUE) {
env->lck->envmode.weak = env->flags & mode_flags;
env->lck->meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env);
env->lck->readers_check_timestamp.weak = osal_monotime();
}
if (lck) {
if (lck_rc == MDBX_RESULT_TRUE) {
rc = lck_downgrade(env);
DEBUG("lck-downgrade-%s: rc %i",
(env->flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc);
if (rc != MDBX_SUCCESS)
return rc;
} else {
rc = mvcc_cleanup_dead(env, false, nullptr);
if (MDBX_IS_ERROR(rc))
return rc;
}
}
rc = (env->flags & MDBX_RDONLY)
? MDBX_SUCCESS
: osal_ioring_create(&env->ioring
#if defined(_WIN32) || defined(_WIN64)
,
ior_direct, env->ioring.overlapped_fd
#endif /* Windows */
);
return rc;
}
/* Closes an environment: releases the TLS key, memory locks, the RTHC
 * registration, OS mappings and file handles; unless resurrecting after a
 * fork, also frees all heap-allocated per-env state.
 * Returns the status of rthc_remove().
 * NOTE(review): `resurrect_after_fork` appears to keep the buffers needed
 * to re-open the same MDBX_env in a forked child — confirm against the
 * fork handler before relying on this. */
__cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
  const unsigned flags = env->flags;
  /* strip internal-only flag bits; user-visible flags are preserved */
  env->flags &= ~ENV_INTERNAL_FLAGS;
  if (flags & ENV_TXKEY) {
    /* release the per-thread key used for reader-slot tracking */
    thread_key_delete(env->me_txkey);
    env->me_txkey = 0;
  }
  if (env->lck)
    munlock_all(env);
  /* unregister from the global registry under its lock;
   * rc becomes the function's result */
  rthc_lock();
  int rc = rthc_remove(env);
  rthc_unlock();
#if MDBX_ENABLE_DBI_LOCKFREE
  /* drain the deferred-free list accumulated by lock-free DBI handling */
  for (defer_free_item_t *next, *ptr = env->defer_free; ptr; ptr = next) {
    next = ptr->next;
    osal_free(ptr);
  }
  env->defer_free = nullptr;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
  if (!(env->flags & MDBX_RDONLY))
    osal_ioring_destroy(&env->ioring);
  env->lck = nullptr;
  if (env->lck_mmap.lck)
    osal_munmap(&env->lck_mmap);
  if (env->dxb_mmap.base) {
    osal_munmap(&env->dxb_mmap);
#ifdef ENABLE_MEMCHECK
    /* tell Valgrind the mapped datafile region is gone */
    VALGRIND_DISCARD(env->valgrind_handle);
    env->valgrind_handle = -1;
#endif /* ENABLE_MEMCHECK */
  }
#if defined(_WIN32) || defined(_WIN64)
  eASSERT(env, !env->ioring.overlapped_fd ||
                   env->ioring.overlapped_fd == INVALID_HANDLE_VALUE);
  if (env->dxb_lock_event != INVALID_HANDLE_VALUE) {
    CloseHandle(env->dxb_lock_event);
    env->dxb_lock_event = INVALID_HANDLE_VALUE;
  }
  /* fork() does not exist on Windows, so resurrection is POSIX-only */
  eASSERT(env, !resurrect_after_fork);
  if (env->pathname_char) {
    osal_free(env->pathname_char);
    env->pathname_char = nullptr;
  }
#endif /* Windows */
  /* close file handles; errors intentionally ignored during teardown */
  if (env->dsync_fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->dsync_fd);
    env->dsync_fd = INVALID_HANDLE_VALUE;
  }
  if (env->lazy_fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->lazy_fd);
    env->lazy_fd = INVALID_HANDLE_VALUE;
  }
  if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->lck_mmap.fd);
    env->lck_mmap.fd = INVALID_HANDLE_VALUE;
  }
  if (!resurrect_after_fork) {
    /* full teardown: free per-DBI tables and the basal write-txn state */
    if (env->kvs) {
      for (size_t i = CORE_DBS; i < env->n_dbi; ++i)
        if (env->kvs[i].name.iov_len)
          osal_free(env->kvs[i].name.iov_base);
      osal_free(env->kvs);
      env->n_dbi = CORE_DBS;
      env->kvs = nullptr;
    }
    if (env->page_auxbuf) {
      osal_memalign_free(env->page_auxbuf);
      env->page_auxbuf = nullptr;
    }
    if (env->dbi_seqs) {
      osal_free(env->dbi_seqs);
      env->dbi_seqs = nullptr;
    }
    if (env->dbs_flags) {
      osal_free(env->dbs_flags);
      env->dbs_flags = nullptr;
    }
    if (env->pathname.buffer) {
      osal_free(env->pathname.buffer);
      env->pathname.buffer = nullptr;
    }
    if (env->basal_txn) {
      /* free the page-lists owned by the preallocated write transaction */
      dpl_free(env->basal_txn);
      txl_free(env->basal_txn->tw.gc.reclaimed);
      pnl_free(env->basal_txn->tw.retired_pages);
      pnl_free(env->basal_txn->tw.spilled.list);
      pnl_free(env->basal_txn->tw.relist);
      osal_free(env->basal_txn);
      env->basal_txn = nullptr;
    }
  }
  /* leave recovery mode */
  env->stuck_meta = -1;
  return rc;
}

136
src/essentials.h Normal file
View File

@ -0,0 +1,136 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#define LIBMDBX_INTERNALS
#define MDBX_DEPRECATED
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
#include "preface.h"
#ifdef xMDBX_ALLOY
/* Amalgamated build */
#define MDBX_INTERNAL static
#else
/* Non-amalgamated build */
#define MDBX_INTERNAL
#endif /* xMDBX_ALLOY */
#include "../mdbx.h"
/*----------------------------------------------------------------------------*/
/* Basic constants and types */
typedef struct iov_ctx iov_ctx_t;
#include "osal.h"
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
#endif /* MDBX_WORDBITS */
#include "options.h"
#include "atomics-types.h"
#include "layout-dxb.h"
#include "layout-lck.h"
#define MIN_MAPSIZE (MDBX_MIN_PAGESIZE * MIN_PAGENO)
#if defined(_WIN32) || defined(_WIN64)
#define MAX_MAPSIZE32 UINT32_C(0x38000000)
#else
#define MAX_MAPSIZE32 UINT32_C(0x7f000000)
#endif
#define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MDBX_MAX_PAGESIZE)
#if MDBX_WORDBITS >= 64
#define MAX_MAPSIZE MAX_MAPSIZE64
#define PAGELIST_LIMIT ((size_t)MAX_PAGENO)
#else
#define MAX_MAPSIZE MAX_MAPSIZE32
#define PAGELIST_LIMIT (MAX_MAPSIZE32 / MDBX_MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482
#define MEGABYTE ((size_t)1 << 20)
/*----------------------------------------------------------------------------*/
/* Logger callback slot: the callback may be registered either in the
 * formatted or the non-formatted flavor; both share a single pointer. */
union logger_union {
  void *ptr;
  MDBX_debug_func *fmt;
  MDBX_debug_func_nofmt *nofmt;
};

/* Process-wide library state; the single instance `globals` is defined
 * in global.c and initialized by mdbx_init()/osal_ctor(). */
struct libmdbx_globals {
  bin128_t bootid; /* system boot-id; zeros mean "unavailable" */
  unsigned sys_pagesize, sys_allocation_granularity;
  uint8_t sys_pagesize_ln2; /* log2(sys_pagesize) */
  uint8_t runtime_flags; /* MDBX_DBG_* bits */
  uint8_t loglevel;
#if defined(_WIN32) || defined(_WIN64)
  bool running_under_Wine;
#elif defined(__linux__) || defined(__gnu_linux__)
  bool running_on_WSL1 /* Windows Subsystem 1 for Linux */;
  uint32_t linux_kernel_version; /* packed 0xAABBCCDD, filled at startup */
#endif /* Linux */
  union logger_union logger;
  osal_fastmutex_t debug_lock; /* serializes logger output */
  size_t logger_buffer_size;
  char *logger_buffer;
};
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
extern struct libmdbx_globals globals;
#if defined(_WIN32) || defined(_WIN64)
extern struct libmdbx_imports imports;
#endif /* Windows */
#include "logging_and_debug.h"
#include "utils.h"
#include "pnl.h"
#ifdef __cplusplus
}
#endif /* __cplusplus */
#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
#if defined(xMDBX_TOOLS)
extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif
#define MDBX_IS_ERROR(rc) \
((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)
/*----------------------------------------------------------------------------*/
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t
int64pgno(int64_t i64) {
  /* Clamp a signed 64-bit value into the valid page-number range:
   * values below MIN_PAGENO saturate to MIN_PAGENO, values above
   * MAX_PAGENO + 1 saturate to MAX_PAGENO, anything else passes through. */
  if (unlikely(i64 < (int64_t)MIN_PAGENO))
    return MIN_PAGENO;
  if (unlikely(i64 > (int64_t)MAX_PAGENO + 1))
    return MAX_PAGENO;
  return (pgno_t)i64;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t
pgno_add(size_t base, size_t augend) {
  /* Saturating sum of two page numbers (see int64pgno() for clamping). */
  assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO);
  const int64_t sum = (int64_t)base + (int64_t)augend;
  return int64pgno(sum);
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t
pgno_sub(size_t base, size_t subtrahend) {
  /* Saturating difference of two page numbers (see int64pgno()). */
  assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 &&
         subtrahend < MAX_PAGENO);
  const int64_t diff = (int64_t)base - (int64_t)subtrahend;
  return int64pgno(diff);
}

1460
src/gc-get.c Normal file

File diff suppressed because it is too large Load Diff

1094
src/gc-put.c Normal file

File diff suppressed because it is too large Load Diff

39
src/gc.h Normal file
View File

@ -0,0 +1,39 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* State carried across gc_update() passes while a commit stores the
 * retired/reclaimed page lists into the GC (FREE_DBI) table. */
typedef struct gc_update_context {
  size_t loop, reserve_adj;
  size_t retired_stored;
  size_t amount, reserved, cleaned_slot, reused_slot, fill_idx;
  txnid_t cleaned_id, rid;
  bool lifo, dense; /* lifo mirrors MDBX_LIFORECLAIM (set in gc_update_init) */
#if MDBX_ENABLE_BIGFOOT
  txnid_t bigfoot; /* seeded from txn->txnid by gc_update_init() */
#endif /* MDBX_ENABLE_BIGFOOT */
  /* the cursor must stay the last member: gc_update_init() zeroes the
   * struct only up to offsetof(gcu_t, cursor) */
  union {
    MDBX_cursor cursor;
    cursor_couple_t couple;
  };
} gcu_t;
static inline int gc_update_init(MDBX_txn *txn, gcu_t *ctx) {
  /* Wipe every bookkeeping field (all members before the cursor union),
   * seed the per-transaction bits, then open a cursor on the GC table. */
  memset(ctx, 0, offsetof(gcu_t, cursor));
  if (txn->env->flags & MDBX_LIFORECLAIM)
    ctx->lifo = true;
#if MDBX_ENABLE_BIGFOOT
  ctx->bigfoot = txn->txnid;
#endif /* MDBX_ENABLE_BIGFOOT */
  return cursor_init(&ctx->cursor, txn, FREE_DBI);
}
#define ALLOC_DEFAULT 0
#define ALLOC_RESERVE 1
#define ALLOC_UNIMPORTANT 2
MDBX_INTERNAL pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num,
uint8_t flags);
MDBX_INTERNAL pgr_t gc_alloc_single(const MDBX_cursor *const mc);
MDBX_INTERNAL int gc_update(MDBX_txn *txn, gcu_t *ctx);

476
src/global.c Normal file
View File

@ -0,0 +1,476 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
static void mdbx_init(void);
static void mdbx_fini(void);
/*----------------------------------------------------------------------------*/
/* mdbx constructor/destructor */
#if defined(_WIN32) || defined(_WIN64)
#if MDBX_BUILD_SHARED_LIBRARY
#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
*
* Define dll's entry point only for Release build when NDEBUG is defined and
* MDBX_WITHOUT_MSVC_CRT=ON. if the entry point isn't defined then MSVC's will
* automatically use DllMainCRTStartup() from CRT library, which also
* automatically call DllMain() from our mdbx.dll */
#pragma comment(linker, "/ENTRY:DllMain")
#endif /* MDBX_WITHOUT_MSVC_CRT */
BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
#else
#if !MDBX_MANUAL_MODULE_HANDLER
static
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
void NTAPI
mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
#endif /* MDBX_BUILD_SHARED_LIBRARY */
{
  /* Shared entry point for both build modes — DllMain() for the DLL and
   * the TLS-callback mdbx_module_handler() for static builds — routing
   * process/thread attach-detach notifications to init/fini/TLS hooks. */
  (void)reserved;
  switch (reason) {
  case DLL_PROCESS_ATTACH:
    /* resolve optional Windows APIs before any other initialization */
    windows_import();
    mdbx_init();
    break;
  case DLL_PROCESS_DETACH:
    mdbx_fini();
    break;
  case DLL_THREAD_ATTACH:
    break;
  case DLL_THREAD_DETACH:
    /* release per-thread resources for the departing thread */
    rthc_thread_dtor(module);
    break;
  }
#if MDBX_BUILD_SHARED_LIBRARY
  return TRUE;
#endif
}
#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
/* *INDENT-OFF* */
/* clang-format off */
#if defined(_MSC_VER)
# pragma const_seg(push)
# pragma data_seg(push)
# ifndef _M_IX86
/* kick a linker to create the TLS directory if not already done */
# pragma comment(linker, "/INCLUDE:_tls_used")
/* Force some symbol references. */
# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
/* specific const-segment for WIN64 */
# pragma const_seg(".CRT$XLB")
const
# else
/* kick a linker to create the TLS directory if not already done */
# pragma comment(linker, "/INCLUDE:__tls_used")
/* Force some symbol references. */
# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
/* specific data-segment for WIN32 */
# pragma data_seg(".CRT$XLB")
# endif
__declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
# pragma data_seg(pop)
# pragma const_seg(pop)
#elif defined(__GNUC__)
# ifndef _M_IX86
const
# endif
PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
#else
# error FIXME
#endif
/* *INDENT-ON* */
/* clang-format on */
#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
#else
#if defined(__linux__) || defined(__gnu_linux__)
#include <sys/utsname.h>
MDBX_EXCLUDE_FOR_GPROF
__cold static uint8_t probe_for_WSL(const char *tag) {
  /* Returns the detected WSL major version (1, 2, ...) or 0 when the tag
   * carries no WSL markers at all.  An explicit digit right after the
   * "WSL"/"wsl" marker wins; otherwise the kernel version decides. */
  const char *const upper = strstr(tag, "WSL");
  if (upper && upper[3] >= '2' && upper[3] <= '9')
    return (uint8_t)(upper[3] - '0');
  const char *const lower = strstr(tag, "wsl");
  if (lower && lower[3] >= '2' && lower[3] <= '9')
    return (uint8_t)(lower[3] - '0');
  if (!upper && !lower && !strcasestr(tag, "Microsoft"))
    return 0;
  /* Expecting no new kernel within WSL1, either it will explicitly
   * marked by an appropriate WSL-version hint. */
  return (globals.linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
}
#endif /* Linux */
#ifdef ENABLE_GPROF
extern void _mcleanup(void);
extern void monstartup(unsigned long, unsigned long);
extern void _init(void);
extern void _fini(void);
extern void __gmon_start__(void) __attribute__((__weak__));
#endif /* ENABLE_GPROF */
MDBX_EXCLUDE_FOR_GPROF
/* Shared-object constructor: starts gprof sampling when profiling is
 * enabled, detects the Linux kernel version and WSL1, then runs the
 * one-time library initialization. */
__cold static __attribute__((__constructor__)) void
mdbx_global_constructor(void) {
#ifdef ENABLE_GPROF
  if (!&__gmon_start__)
    monstartup((uintptr_t)&_init, (uintptr_t)&_fini);
#endif /* ENABLE_GPROF */
#if defined(__linux__) || defined(__gnu_linux__)
  /* Pack up to four numeric components of `uname -r` into
   * globals.linux_kernel_version as 0xAABBCCDD (major first). */
  struct utsname buffer;
  if (uname(&buffer) == 0) {
    int i = 0;
    char *p = buffer.release;
    while (*p && i < 4) {
      if (*p >= '0' && *p <= '9') {
        long number = strtol(p, &p, 10);
        if (number > 0) {
          if (number > 255)
            number = 255;
          /* cast before shifting: `255L << 24` would overflow a 32-bit
           * signed long, which is undefined behavior on ILP32 targets */
          globals.linux_kernel_version += (uint32_t)number << (24 - i * 8);
        }
        ++i;
      } else {
        ++p;
      }
    }
    /* "Official" way of detecting WSL1 but not WSL2
     * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
     *
     * WARNING: False negative detection of WSL1 will result in DATA LOSS!
     * So, the REQUIREMENTS for this code:
     * 1. MUST detect WSL1 without false-negatives.
     * 2. DESIRABLE detect WSL2 but without the risk of violating the first. */
    globals.running_on_WSL1 = probe_for_WSL(buffer.version) == 1 ||
                              probe_for_WSL(buffer.sysname) == 1 ||
                              probe_for_WSL(buffer.release) == 1;
  }
#endif /* Linux */
  mdbx_init();
}
MDBX_EXCLUDE_FOR_GPROF
/* Shared-object destructor: one-time library teardown, then flushing of
 * gprof data when profiling is enabled. */
__cold static __attribute__((__destructor__)) void
mdbx_global_destructor(void) {
  mdbx_fini();
#ifdef ENABLE_GPROF
  if (!&__gmon_start__)
    _mcleanup();
#endif /* ENABLE_GPROF */
}
#endif /* ! Windows */
/******************************************************************************/
struct libmdbx_globals globals;
/* One-time library initialization: debug/log defaults, the logger mutex,
 * OS-dependent globals (osal_ctor fills sys_pagesize & friends) and the
 * reader-thread registry; invoked from the platform constructor/DllMain. */
__cold static void mdbx_init(void) {
  globals.runtime_flags = ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT +
                          ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT;
  globals.loglevel = MDBX_LOG_FATAL;
  ENSURE(nullptr, osal_fastmutex_init(&globals.debug_lock) == 0);
  osal_ctor();
  /* the system page size must be a power of two */
  assert(globals.sys_pagesize > 0 &&
         (globals.sys_pagesize & (globals.sys_pagesize - 1)) == 0);
  rthc_ctor();
#if MDBX_DEBUG
  /* self-check internal state machines in debug builds */
  ENSURE(nullptr, troika_verify_fsm());
  ENSURE(nullptr, pv2pages_verify());
#endif /* MDBX_DEBUG*/
}
MDBX_EXCLUDE_FOR_GPROF
/* One-time library teardown, mirroring mdbx_init() in reverse order. */
__cold static void mdbx_fini(void) {
  const uint32_t current_pid = osal_getpid();
  TRACE(">> pid %d", current_pid);
  rthc_dtor(current_pid);
  osal_dtor();
  TRACE("<< pid %d\n", current_pid);
  ENSURE(nullptr, osal_fastmutex_destroy(&globals.debug_lock) == 0);
}
/******************************************************************************/
/* *INDENT-OFF* */
/* clang-format off */
/* Build-information singleton exposed through the public API: build
 * timestamp, target platform, compile-time options, compiler and flags.
 * Everything is assembled from predefined macros at compile time;
 * there is no runtime code in this initializer. */
__dll_export
#ifdef __attribute_used__
__attribute_used__
#elif defined(__GNUC__) || __has_attribute(__used__)
__attribute__((__used__))
#endif
#ifdef __attribute_externally_visible__
__attribute_externally_visible__
#elif (defined(__GNUC__) && !defined(__clang__)) || \
__has_attribute(__externally_visible__)
__attribute__((__externally_visible__))
#endif
const struct MDBX_build_info mdbx_build = {
/* build timestamp */
#ifdef MDBX_BUILD_TIMESTAMP
MDBX_BUILD_TIMESTAMP
#else
"\"" __DATE__ " " __TIME__ "\""
#endif /* MDBX_BUILD_TIMESTAMP */
,
/* target platform: "<OS>-<ARCH>[-<BUILD_TYPE>]" */
#ifdef MDBX_BUILD_TARGET
MDBX_BUILD_TARGET
#else
#if defined(__ANDROID_API__)
"Android" MDBX_STRINGIFY(__ANDROID_API__)
#elif defined(__linux__) || defined(__gnu_linux__)
"Linux"
#elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__)
"webassembly"
#elif defined(__CYGWIN__)
"CYGWIN"
#elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \
|| defined(__WINDOWS__)
"Windows"
#elif defined(__APPLE__)
#if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \
|| (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR)
"iOS"
#else
"MacOS"
#endif
#elif defined(__FreeBSD__)
"FreeBSD"
#elif defined(__DragonFly__)
"DragonFlyBSD"
#elif defined(__NetBSD__)
"NetBSD"
#elif defined(__OpenBSD__)
"OpenBSD"
#elif defined(__bsdi__)
"UnixBSDI"
#elif defined(__MACH__)
"MACH"
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC))
"HPUX"
#elif defined(_AIX)
"AIX"
#elif defined(__sun) && defined(__SVR4)
"Solaris"
#elif defined(__BSD__) || defined(BSD)
"UnixBSD"
#elif defined(__unix__) || defined(UNIX) || defined(__unix) \
|| defined(__UNIX) || defined(__UNIX__)
"UNIX"
#elif defined(_POSIX_VERSION)
"POSIX" MDBX_STRINGIFY(_POSIX_VERSION)
#else
"UnknownOS"
#endif /* Target OS */
"-"
#if defined(__amd64__)
"AMD64"
#elif defined(__ia32__)
"IA32"
#elif defined(__e2k__) || defined(__elbrus__)
"Elbrus"
#elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
"Alpha"
#elif defined(__aarch64__) || defined(_M_ARM64)
"ARM64"
#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \
|| defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \
|| defined(_M_ARMT) || defined(__arm)
"ARM"
#elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64))
"MIPS64"
#elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__)
"MIPS"
#elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64)
"PARISC64"
#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
"PARISC"
#elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \
|| defined(__IA64__) || defined(_M_IA64) || defined(__itanium__)
"Itanium"
#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \
|| defined(__powerpc64) || defined(_ARCH_PPC64)
"PowerPC64"
#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \
|| defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__)
"PowerPC"
#elif defined(__sparc64__) || defined(__sparc64)
"SPARC64"
#elif defined(__sparc__) || defined(__sparc)
"SPARC"
#elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch)
"S390"
#else
"UnknownARCH"
#endif
#endif /* MDBX_BUILD_TARGET */
#ifdef MDBX_BUILD_TYPE
# if defined(_MSC_VER)
# pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE)
# endif
"-" MDBX_BUILD_TYPE
#endif /* MDBX_BUILD_TYPE */
,
/* compile-time options relevant to functionality and binary layout */
"MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG)
#ifdef ENABLE_GPROF
" ENABLE_GPROF"
#endif /* ENABLE_GPROF */
" MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS)
" BYTE_ORDER="
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
"LITTLE_ENDIAN"
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
"BIG_ENDIAN"
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
" MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT)
" MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG
" MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
" MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG
" MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG
" MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG
" MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC)
" MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
" MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE)
" MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE)
" MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT)
" MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC)
#if MDBX_DISABLE_VALIDATION
" MDBX_DISABLE_VALIDATION=YES"
#endif /* MDBX_DISABLE_VALIDATION */
#ifdef __SANITIZE_ADDRESS__
" SANITIZE_ADDRESS=YES"
#endif /* __SANITIZE_ADDRESS__ */
#ifdef ENABLE_MEMCHECK
" ENABLE_MEMCHECK=YES"
#endif /* ENABLE_MEMCHECK */
#if MDBX_FORCE_ASSERTIONS
" MDBX_FORCE_ASSERTIONS=YES"
#endif /* MDBX_FORCE_ASSERTIONS */
#ifdef _GNU_SOURCE
" _GNU_SOURCE=YES"
#else
" _GNU_SOURCE=NO"
#endif /* _GNU_SOURCE */
#ifdef __APPLE__
" MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" MDBX_STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY)
#endif /* MacOS */
#if defined(_WIN32) || defined(_WIN64)
" MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_WITHOUT_MSVC_CRT)
" MDBX_BUILD_SHARED_LIBRARY=" MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY)
#if !MDBX_BUILD_SHARED_LIBRARY
" MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER)
#endif
" WINVER=" MDBX_STRINGIFY(WINVER)
#else /* Windows */
" MDBX_LOCKING=" MDBX_LOCKING_CONFIG
" MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG
#endif /* !Windows */
" MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE)
" MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT)
" MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE)
" MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE)
" MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK)
" MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING)
,
/* compiler id & version */
#ifdef MDBX_BUILD_COMPILER
MDBX_BUILD_COMPILER
#else
#ifdef __INTEL_COMPILER
"Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER)
#elif defined(__apple_build_version__)
"Apple clang " MDBX_STRINGIFY(__apple_build_version__)
#elif defined(__ibmxl__)
"IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__)
"." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__)
#elif defined(__clang__)
"clang " MDBX_STRINGIFY(__clang_version__)
#elif defined(__MINGW64__)
"MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION)
#elif defined(__MINGW32__)
"MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION)
#elif defined(__MINGW__)
"MINGW " MDBX_STRINGIFY(__MINGW_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW_MINOR_VERSION)
#elif defined(__IBMC__)
"IBM C " MDBX_STRINGIFY(__IBMC__)
#elif defined(__GNUC__)
"GNU C/C++ "
#ifdef __VERSION__
__VERSION__
#else
MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." MDBX_STRINGIFY(__GNUC_PATCHLEVEL__)
#endif
#elif defined(_MSC_VER)
"MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD)
#else
"Unknown compiler"
#endif
#endif /* MDBX_BUILD_COMPILER */
,
/* build flags supplied by the build system */
#ifdef MDBX_BUILD_FLAGS_CONFIG
MDBX_BUILD_FLAGS_CONFIG
#endif /* MDBX_BUILD_FLAGS_CONFIG */
#ifdef MDBX_BUILD_FLAGS
MDBX_BUILD_FLAGS
#endif /* MDBX_BUILD_FLAGS */
#if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS))
"undefined (please use correct build script)"
#ifdef _MSC_VER
#pragma message("warning: Build flags undefined. Please use correct build script")
#else
#warning "Build flags undefined. Please use correct build script"
#endif // _MSC_VER
#endif
};
#ifdef __SANITIZE_ADDRESS__
/* Library-wide defaults for AddressSanitizer; the weak attribute lets an
 * application override them with its own __asan_default_options(). */
#if !defined(_MSC_VER) || __has_attribute(weak)
LIBMDBX_API __attribute__((__weak__))
#endif
const char *__asan_default_options(void) {
  return "symbolize=1:allow_addr2line=1:"
#if MDBX_DEBUG
         "debug=1:"
         "verbosity=2:"
#endif /* MDBX_DEBUG */
         "log_threads=1:"
         "report_globals=1:"
         "replace_str=1:replace_intrin=1:"
         "malloc_context_size=9:"
#if !defined(__APPLE__)
         "detect_leaks=1:"
#endif
         "check_printf=1:"
         "detect_deadlocks=1:"
#ifndef LTO_ENABLED
         "check_initialization_order=1:"
#endif
         "detect_stack_use_after_return=1:"
         "intercept_tls_get_addr=1:"
         "decorate_proc_maps=1:"
         "abort_on_error=1";
}
#endif /* __SANITIZE_ADDRESS__ */
/* *INDENT-ON* */
/* clang-format on */

File diff suppressed because it is too large Load Diff

306
src/layout-dxb.h Normal file
View File

@ -0,0 +1,306 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
#pragma pack(push, 4)
/* A stamp that identifies a file as an MDBX file.
* There's nothing special about this value other than that it is easily
* recognizable, and it will reflect any byte order mismatches. */
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
#define MDBX_DATA_MAGIC \
((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_LEGACY_COMPAT \
((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)
#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
/* handle for the default DB. */
#define MAIN_DBI 1
/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2
/* Number of meta pages - also hardcoded elsewhere */
#define NUM_METAS 3
/* A page number in the database.
*
* MDBX uses 32 bit for page numbers. This limits database
* size up to 2^44 bytes, in case of 4K pages. */
typedef uint32_t pgno_t;
typedef mdbx_atomic_uint32_t atomic_pgno_t;
#define PRIaPGNO PRIu32
#define MAX_PAGENO UINT32_C(0x7FFFffff)
#define MIN_PAGENO NUM_METAS
/* An invalid page number.
* Mainly used to denote an empty tree. */
#define P_INVALID (~(pgno_t)0)
/* A transaction ID. */
typedef uint64_t txnid_t;
typedef mdbx_atomic_uint64_t atomic_txnid_t;
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
#define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
#define INVALID_TXNID UINT64_MAX
/* Used for offsets within a single page. */
typedef uint16_t indx_t;
/* On-disk descriptor of a single b-tree; embedded into meta pages
 * (see meta_t.trees for the GC and MAIN trees). */
typedef struct tree {
  uint16_t flags; /* see mdbx_dbi_open */
  uint16_t height; /* height of this tree */
  uint32_t dupfix_size; /* key-size for MDBX_DUPFIXED (DUPFIX pages) */
  pgno_t root; /* the root page of this tree */
  pgno_t branch_pages; /* number of internal pages */
  pgno_t leaf_pages; /* number of leaf pages */
  pgno_t large_pages; /* number of large pages */
  uint64_t sequence; /* table sequence counter */
  uint64_t items; /* number of data items */
  uint64_t mod_txnid; /* txnid of last committed modification */
} tree_t;
/* database size-related parameters; each union provides two names for
 * the same field */
typedef struct geo {
  uint16_t grow_pv; /* datafile growth step as a 16-bit packed (exponential
                       quantized) value */
  uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
                        (exponential quantized) value */
  pgno_t lower; /* minimal size of datafile in pages */
  pgno_t upper; /* maximal size of datafile in pages */
  union {
    pgno_t now; /* current size of datafile in pages */
    pgno_t end_pgno; /* alias of `now` */
  };
  union {
    pgno_t first_unallocated; /* first unused page in the datafile,
                                but actually the file may be shorter. */
    pgno_t next_pgno; /* alias of `first_unallocated` */
  };
} geo_t;
/* 128-bit value viewable as two 64-bit or four 32-bit words;
 * used for the system boot-id (meta_t.bootid, globals.bootid). */
typedef union bin128 {
  __anonymous_struct_extension__ struct {
    uint64_t x, y;
  };
  __anonymous_struct_extension__ struct {
    uint32_t a, b, c, d;
  };
} bin128_t;
/* Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * Pages 0-2 are meta pages. */
typedef struct meta {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
  uint32_t magic_and_version[2];
  /* txnid that committed this meta, the first of a two-phase-update pair */
  union {
    mdbx_atomic_uint32_t txnid_a[2];
    uint64_t unsafe_txnid;
  };
  uint16_t reserve16; /* extra flags, zero (nothing) for now */
  uint8_t validator_id; /* ID of checksum and page validation method,
                         * zero (nothing) for now */
  int8_t extra_pagehdr; /* extra bytes in the page header,
                         * zero (nothing) for now */
  geo_t geometry; /* database size-related parameters */
  union {
    struct {
      tree_t gc, main;
    } trees;
    /* overlay of the leading fields of trees.gc (flags, height,
     * dupfix_size) giving direct access to the DB page size */
    __anonymous_struct_extension__ struct {
      uint16_t gc_flags;
      uint16_t gc_height;
      uint32_t pagesize;
    };
  };
  MDBX_canary canary;
#define DATASIGN_NONE 0u
#define DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > DATASIGN_WEAK)
  /* data-sign: NONE/WEAK or a value indicating a steady commit */
  union {
    uint32_t sign[2];
    uint64_t unsafe_sign;
  };
  /* txnid that committed this meta, the second of a two-phase-update pair */
  mdbx_atomic_uint32_t txnid_b[2];
  /* Number of non-meta pages which were put in GC after COW. May be 0 in case
   * DB was previously handled by libmdbx without corresponding feature.
   * This value in couple with reader.snapshot_pages_retired allows fast
   * estimation of "how much reader is restraining GC recycling". */
  uint32_t pages_retired[2];
  /* The analogue /proc/sys/kernel/random/boot_id or similar to determine
   * whether the system was rebooted after the last use of the database files.
   * If there was no reboot, but there is no need to rollback to the last
   * steady sync point. Zeros mean that no relevant information is available
   * from the system. */
  bin128_t bootid;
} meta_t;
/* The on-disk structures below are byte-packed; default alignment is
 * restored by the matching #pragma pack(pop) further down. */
#pragma pack(1)

/* Page kinds and flag bits stored in page_t.flags. */
typedef enum page_type {
  P_BRANCH = 0x01u /* branch page */,
  P_LEAF = 0x02u /* leaf page */,
  P_LARGE = 0x04u /* large/overflow page */,
  P_META = 0x08u /* meta page */,
  P_LEGACY_DIRTY = 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */,
  P_BAD = P_LEGACY_DIRTY /* explicit flag for invalid/bad page */,
  P_DUPFIX = 0x20u /* for MDBX_DUPFIXED records */,
  P_SUBP = 0x40u /* for MDBX_DUPSORT sub-pages */,
  P_SPILLED = 0x2000u /* spilled in parent txn */,
  P_LOOSE = 0x4000u /* page was dirtied then freed, can be reused */,
  P_FROZEN = 0x8000u /* used for retire page with known status */,
  /* Mask of bit combinations that are never valid together. */
  P_ILL_BITS = (uint16_t)~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE | P_SPILLED),

  /* Canonical combinations of the flags above, for readable comparisons. */
  page_broken = 0,
  page_large = P_LARGE,
  page_branch = P_BRANCH,
  page_leaf = P_LEAF,
  page_dupfix_leaf = P_DUPFIX,
  page_sub_leaf = P_SUBP | P_LEAF,
  page_sub_dupfix_leaf = P_SUBP | P_DUPFIX,
  page_sub_broken = P_SUBP,
} page_type_t;
/* Common header for all page types. The page type depends on flags.
 *
 * P_BRANCH and P_LEAF pages have unsorted 'node_t's at the end, with
 * sorted entries[] entries referring to them. Exception: P_DUPFIX pages
 * omit entries and pack sorted MDBX_DUPFIXED values after the page header.
 *
 * P_LARGE records occupy one or more contiguous pages where only the
 * first has a page header. They hold the real data of N_BIGDATA nodes.
 *
 * P_SUBP sub-pages are small leaf "pages" with duplicate data.
 * A node with flag N_DUPDATA but not N_SUBDATA contains a sub-page.
 * (Duplicate data can also go in sub-databases, which use normal pages.)
 *
 * P_META pages contain meta_t, the start point of an MDBX snapshot.
 *
 * Each non-metapage up to the last allocated page (presumably
 * meta_t.geometry.first_unallocated; the historical name was mm_last_pg —
 * TODO confirm) is reachable exactly once in the snapshot: Either used by
 * a database or listed in a GC record. */
typedef struct page {
  uint64_t txnid;        /* txnid which created page, maybe zero in legacy DB */
  uint16_t dupfix_ksize; /* key size if this is a DUPFIX page */
  uint16_t flags;        /* page_type_t bits */
  union {
    uint32_t pages; /* number of overflow pages */
    __anonymous_struct_extension__ struct {
      indx_t lower; /* lower bound of free space */
      indx_t upper; /* upper bound of free space */
    };
  };
  pgno_t pgno; /* page number */
#if FLEXIBLE_ARRAY_MEMBERS
  indx_t entries[] /* dynamic size */;
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} page_t;

/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ 20u
/* Header for a single key/data pair within a page.
 * Used in pages of type P_BRANCH and P_LEAF without P_DUPFIX.
 * We guarantee 2-byte alignment for 'node_t's.
 *
 * Leaf node flags describe node contents. N_BIGDATA says the node's
 * data part is the page number of an overflow page with actual data.
 * N_DUPDATA and N_SUBDATA can be combined giving duplicate data in
 * a sub-page/sub-database, and named databases (just N_SUBDATA). */
typedef struct node {
  /* Field order is swapped between the two byte-order branches below;
   * the same struct is declared in both so the on-disk byte layout of
   * the dsize/child_pgno word and ksize stays consistent. */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  union {
    uint32_t dsize;      /* presumably data size (leaf) — confirm at use sites */
    uint32_t child_pgno; /* presumably child page number (branch) — confirm */
  };
  uint8_t flags; /* see node_flags */
  uint8_t extra;
  uint16_t ksize; /* key size */
#else
  uint16_t ksize; /* key size */
  uint8_t extra;
  uint8_t flags; /* see node_flags */
  union {
    uint32_t child_pgno;
    uint32_t dsize;
  };
#endif /* __BYTE_ORDER__ */
#if FLEXIBLE_ARRAY_MEMBERS
  uint8_t payload[] /* key and data are appended here */;
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} node_t;

/* Size of the node header, excluding dynamic data at the end */
#define NODESIZE 8u
/* Leaf-node flags, stored in node_t.flags (see node_t comment above). */
typedef enum node_flags {
  N_BIGDATA = 0x01 /* data put on large page */,
  N_SUBDATA = 0x02 /* data is a sub-database */,
  N_DUPDATA = 0x04 /* data has duplicates */
} node_flags_t;

/* Restore default struct packing (paired with the pack(1) above). */
#pragma pack(pop)
/* Returns the raw page-type/flags word of a page, truncated to its
 * low byte (flags is a uint16_t, the return type is uint8_t). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
page_type(const page_t *mp) {
  const uint8_t type = (uint8_t)mp->flags;
  return type;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
page_type_compat(const page_t *mp) {
  /* For sub-pages, strip the P_SUBP marker together with the legacy
   * (pre-v0.10) P_DIRTY bit for compatibility; for assertions only. */
  const uint16_t flags = mp->flags;
  if (unlikely(flags & P_SUBP))
    return flags & ~(P_SUBP | P_LEGACY_DIRTY);
  return flags;
}
/* True when the P_LEAF bit is present in the page flags. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_leaf(const page_t *mp) {
  const uint16_t flags = mp->flags;
  return (flags & P_LEAF) ? true : false;
}
/* True for pages holding MDBX_DUPFIXED records (P_DUPFIX bit set). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_dupfix_leaf(const page_t *mp) {
  return 0 != (mp->flags & P_DUPFIX);
}
/* True for branch pages (P_BRANCH bit set). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_branch(const page_t *mp) {
  const bool branch = (mp->flags & P_BRANCH) != 0;
  return branch;
}
/* True for large/overflow pages (P_LARGE bit set). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_largepage(const page_t *mp) {
  return 0 != (mp->flags & P_LARGE);
}
/* True for MDBX_DUPSORT sub-pages (P_SUBP bit set). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_subpage(const page_t *mp) {
  const bool subp = (mp->flags & P_SUBP) != 0;
  return subp;
}

285
src/layout-lck.h Normal file
View File

@ -0,0 +1,285 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations of the license
/// change, credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 5
/* Per-build IPC-lock flavor: each locking implementation contributes its own
 * signature (MDBX_LCK_SIGN, mixed into MDBX_LOCK_FORMAT below) and the type
 * of the shared in-lck-file lock object. */
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_LCK_SIGN UINT32_C(0xF10C)
/* Windows file locks keep no lock object inside the lck file. */
typedef void osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
#define MDBX_LCK_SIGN UINT32_C(0xF18D)
/* SysV semaphores: the slot stores the owner pid. */
typedef mdbx_pid_t osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
#define MDBX_LCK_SIGN UINT32_C(0x8017)
/* POSIX process-shared mutexes. */
typedef pthread_mutex_t osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
#define MDBX_LCK_SIGN UINT32_C(0xFC29)
/* POSIX unnamed semaphores. */
typedef sem_t osal_ipclock_t;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */
/* GC profiling statistics. */
typedef struct gc_prof_stat {
  /* Monotonic wall-clock time spent reading and searching inside the GC */
  uint64_t rtime_monotonic;
  /* User-mode CPU time spent preparing pages extracted from the GC,
   * including paging them in from disk. */
  uint64_t xtime_cpu;
  /* Number of read-search iterations inside the GC while allocating pages */
  uint32_t rsteps;
  /* Number of requests to allocate sequences of pages,
   * i.e. when more than one page is requested at once */
  uint32_t xpages;
  /* Slow path execution count */
  uint32_t spe_counter;
  /* page faults (hard page faults) */
  uint32_t majflt;
} gc_prof_stat_t;
/* Statistics of pages operations for all transactions,
 * including incomplete and aborted. */
typedef struct pgops {
  mdbx_atomic_uint64_t newly;   /* Quantity of new pages added */
  mdbx_atomic_uint64_t cow;     /* Quantity of pages copied for update */
  mdbx_atomic_uint64_t clone;   /* Quantity of parent's dirty pages clones
                                   for nested transactions */
  mdbx_atomic_uint64_t split;   /* Page splits */
  mdbx_atomic_uint64_t merge;   /* Page merges */
  mdbx_atomic_uint64_t spill;   /* Quantity of spilled dirty pages */
  mdbx_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
  mdbx_atomic_uint64_t
      wops; /* Number of explicit write operations (not pages) to a disk */
  mdbx_atomic_uint64_t
      msync; /* Number of explicit msync/flush-to-disk operations */
  mdbx_atomic_uint64_t
      fsync; /* Number of explicit fsync/flush-to-disk operations */
  mdbx_atomic_uint64_t prefault; /* Number of prefault write operations */
  mdbx_atomic_uint64_t mincore;  /* Number of mincore() calls */
  mdbx_atomic_uint32_t
      incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269
                      caught */
  mdbx_atomic_uint32_t reserved;

  /* GC profiling statistics.
   * Logically this data might belong in a separate structure,
   * but the difference would be purely cosmetic. */
  struct {
    /* Costs of maintaining the user's data */
    gc_prof_stat_t work;
    /* Costs of maintaining and updating the GC itself */
    gc_prof_stat_t self;
    /* GC update iterations, greater than 1 if there were retries/restarts */
    uint32_t wloops;
    /* GC record coalescing (merge) iterations */
    uint32_t coalescences;
    /* Wipes of steady commit points in MDBX_UTTERLY_NOSYNC mode */
    uint32_t wipes;
    /* Flushes of data to disk outside of MDBX_UTTERLY_NOSYNC mode */
    uint32_t flushes;
    /* Attempts to kick lagging readers */
    uint32_t kicks;
  } gc_prof;
} pgop_stat_t;
/* Reader Lock Table
*
* Readers don't acquire any locks for their data access. Instead, they
* simply record their transaction ID in the reader table. The reader
* mutex is needed just to find an empty slot in the reader table. The
* slot's address is saved in thread-specific data so that subsequent
* read transactions started by the same thread need no further locking to
* proceed.
*
* If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in
* thread-specific data. No reader table is used if the database is on a
* read-only filesystem.
*
* Since the database uses multi-version concurrency control, readers don't
* actually need any locking. This table is used to keep track of which
* readers are using data from which old transactions, so that we'll know
* when a particular old transaction is no longer in use. Old transactions
* that have discarded any data pages can then have those pages reclaimed
* for use by a later write transaction.
*
* The lock table is constructed such that reader slots are aligned with the
* processor's cache line size. Any slot is only ever used by one thread.
* This alignment guarantees that there will be no contention or cache
* thrashing as threads update their own slot info, and also eliminates
* any need for locking when accessing a slot.
*
* A writer thread will scan every slot in the table to determine the oldest
* outstanding reader transaction. Any freed pages older than this will be
* reclaimed by the writer. The writer doesn't use any locks when scanning
* this table. This means that there's no guarantee that the writer will
* see the most up-to-date reader info, but that's not required for correct
* operation - all we need is to know the upper bound on the oldest reader,
* we don't care at all about the newest reader. So the only consequence of
* reading stale information here is that old pages might hang around a
* while longer before being reclaimed. That's actually good anyway, because
* the longer we delay reclaiming old pages, the more likely it is that a
* string of contiguous pages can be found after coalescing old pages from
* many old transactions together. */
/* The actual reader record, with cacheline padding. */
typedef struct reader_slot {
  /* Current Transaction ID when this transaction began, or INVALID_TXNID.
   * Multiple readers that start at the same time will probably have the
   * same ID here. Again, it's not important to exclude them from
   * anything; all we need to know is which version of the DB they
   * started from so we can avoid overwriting any data used in that
   * particular version. */
  atomic_txnid_t txnid;

  /* The information we store in a single slot of the reader table.
   * In addition to a transaction ID, we also record the process and
   * thread ID that owns a slot, so that we can detect stale information,
   * e.g. threads or processes that went away without cleaning up.
   *
   * NOTE: We currently don't check for stale records.
   * We simply re-init the table when we know that we're the only process
   * opening the lock file. */

  /* The thread ID of the thread owning this txn. */
  mdbx_atomic_uint64_t tid;

  /* The process ID of the process owning this reader txn. */
  mdbx_atomic_uint32_t pid;

  /* The number of pages used in the reader's MVCC snapshot,
   * i.e. the value of meta->geometry.first_unallocated and
   * txn->geo.first_unallocated */
  atomic_pgno_t snapshot_pages_used;

  /* Number of retired pages at the time this reader starts a transaction.
   * So, at any time the difference meta.pages_retired -
   * reader.snapshot_pages_retired gives the number of pages which this
   * reader is restraining from reuse. */
  mdbx_atomic_uint64_t snapshot_pages_retired;
} reader_slot_t;
/* The header for the reader table (a memory-mapped lock file). */
typedef struct shared_lck {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
  uint64_t magic_and_version;

  /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
  uint32_t os_and_format;

  /* Flags with which the environment was opened. */
  mdbx_atomic_uint32_t envmode;

  /* Threshold of un-synced-with-disk pages for auto-sync feature,
   * zero means no-threshold, i.e. auto-sync is disabled. */
  atomic_pgno_t autosync_threshold;

  /* Low 32 bits of the txnid with which the meta-pages were synced,
   * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3)
#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8)
#define MDBX_NOMETASYNC_LAZY_WRITEMAP                                          \
  (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8)
  mdbx_atomic_uint32_t meta_sync_txnid;

  /* Period for timed auto-sync feature, i.e. at the every steady checkpoint
   * the mti_unsynced_timeout sets to the current_time + autosync_period.
   * The time value is represented in a suitable system-dependent form, for
   * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
   * Zero means timed auto-sync is disabled. */
  mdbx_atomic_uint64_t autosync_period;

  /* Marker to distinguish uniqueness of DB/CLK. */
  mdbx_atomic_uint64_t bait_uniqueness;

  /* Paired counter of processes that have mlock()ed part of mmapped DB.
   * (mlcnt[0] - mlcnt[1]) > 0 means at least one process has locked
   * at least one page, and therefore madvise() could return EINVAL. */
  mdbx_atomic_uint32_t mlcnt[2];

  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

  /* Statistics of costly ops of all (running, completed and aborted)
   * transactions */
  pgop_stat_t pgops;

  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

#if MDBX_LOCKING > 0
  /* Write transaction lock. */
  osal_ipclock_t wrt_lock;
#endif /* MDBX_LOCKING > 0 */

  atomic_txnid_t cached_oldest;

  /* Timestamp of entering an out-of-sync state. Value is represented in a
   * suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
   * or clock_gettime(CLOCK_MONOTONIC). */
  mdbx_atomic_uint64_t eoos_timestamp;

  /* Number of un-synced-with-disk pages for auto-sync feature. */
  mdbx_atomic_uint64_t unsynced_pages;

  /* Timestamp of the last readers check. */
  mdbx_atomic_uint64_t readers_check_timestamp;

  /* Number of pages which were discarded last time by madvise(DONTNEED). */
  atomic_pgno_t discarded_tail;

  /* Shared anchor for tracking readahead edge and enabled/disabled status. */
  pgno_t readahead_anchor;

  /* Shared cache for mincore() results */
  struct {
    pgno_t begin[4];
    uint64_t mask[4];
  } mincore_cache;

  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

#if MDBX_LOCKING > 0
  /* Readers table lock. */
  osal_ipclock_t rdt_lock;
#endif /* MDBX_LOCKING > 0 */

  /* The number of slots that have been used in the reader table.
   * This always records the maximum count, it is not decremented
   * when readers release their slots. */
  mdbx_atomic_uint32_t rdt_length;
  mdbx_atomic_uint32_t rdt_refresh_flag;

#if FLEXIBLE_ARRAY_MEMBERS
  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
  reader_slot_t rdt[] /* dynamic size */;

/* Lockfile format signature: version, features and field layout */
#define MDBX_LOCK_FORMAT                                                       \
  (MDBX_LCK_SIGN * 27733 + (unsigned)sizeof(reader_slot_t) * 13 +              \
   (unsigned)offsetof(reader_slot_t, snapshot_pages_used) * 251 +              \
   (unsigned)offsetof(lck_t, cached_oldest) * 83 +                             \
   (unsigned)offsetof(lck_t, rdt_length) * 37 +                                \
   (unsigned)offsetof(lck_t, rdt) * 29)
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} lck_t;
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
#define MDBX_READERS_LIMIT 32767

View File

@ -1,18 +1,9 @@
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#if !(defined(_WIN32) || defined(_WIN64)) /* !Windows LCK-implementation */
#if !(defined(_WIN32) || defined(_WIN64))
/*----------------------------------------------------------------------------*
* POSIX/non-Windows LCK-implementation */
#include "internals.h"
@ -20,112 +11,21 @@
#include <sys/sem.h>
#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */
/*----------------------------------------------------------------------------*/
/* global constructor/destructor */
#if defined(__linux__) || defined(__gnu_linux__)
#include <sys/utsname.h>
MDBX_INTERNAL_VAR_INSTA uint32_t linux_kernel_version;
MDBX_INTERNAL_VAR_INSTA bool
mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
MDBX_EXCLUDE_FOR_GPROF
__cold static uint8_t probe_for_WSL(const char *tag) {
const char *const WSL = strstr(tag, "WSL");
if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
return WSL[3] - '0';
const char *const wsl = strstr(tag, "wsl");
if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
return wsl[3] - '0';
if (WSL || wsl || strcasestr(tag, "Microsoft"))
/* Expecting no new kernel within WSL1, either it will explicitly
* marked by an appropriate WSL-version hint. */
return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
return 0;
}
#endif /* Linux */
#ifdef ENABLE_GPROF
extern void _mcleanup(void);
extern void monstartup(unsigned long, unsigned long);
extern void _init(void);
extern void _fini(void);
extern void __gmon_start__(void) __attribute__((__weak__));
#endif /* ENABLE_GPROF */
MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__constructor__)) void
mdbx_global_constructor(void) {
#ifdef ENABLE_GPROF
if (!&__gmon_start__)
monstartup((uintptr_t)&_init, (uintptr_t)&_fini);
#endif /* ENABLE_GPROF */
#if defined(__linux__) || defined(__gnu_linux__)
struct utsname buffer;
if (uname(&buffer) == 0) {
int i = 0;
char *p = buffer.release;
while (*p && i < 4) {
if (*p >= '0' && *p <= '9') {
long number = strtol(p, &p, 10);
if (number > 0) {
if (number > 255)
number = 255;
linux_kernel_version += number << (24 - i * 8);
}
++i;
} else {
++p;
}
}
/* "Official" way of detecting WSL1 but not WSL2
* https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
*
* WARNING: False negative detection of WSL1 will result in DATA LOSS!
* So, the REQUIREMENTS for this code:
* 1. MUST detect WSL1 without false-negatives.
* 2. DESIRABLE detect WSL2 but without the risk of violating the first. */
mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 ||
probe_for_WSL(buffer.sysname) == 1 ||
probe_for_WSL(buffer.release) == 1;
}
#endif /* Linux */
global_ctor();
}
MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__destructor__)) void
mdbx_global_destructor(void) {
global_dtor();
#ifdef ENABLE_GPROF
if (!&__gmon_start__)
_mcleanup();
#endif /* ENABLE_GPROF */
}
/*----------------------------------------------------------------------------*/
/* lck */
/* Описание реализации блокировок для POSIX & Linux:
*
* lck-файл отображается в память, в нём организуется таблица читателей и
* размещаются совместно используемые posix-мьютексы (futex). Посредством
* этих мьютексов (см struct MDBX_lockinfo) реализуются:
* этих мьютексов (см struct lck_t) реализуются:
* - Блокировка таблицы читателей для регистрации,
* т.е. функции osal_rdt_lock() и osal_rdt_unlock().
* т.е. функции lck_rdt_lock() и lck_rdt_unlock().
* - Блокировка БД для пишущих транзакций,
* т.е. функции osal_txn_lock() и osal_txn_unlock().
* т.е. функции lck_txn_lock() и lck_txn_unlock().
*
* Остальной функционал реализуется отдельно посредством файловых блокировок:
* - Первоначальный захват БД в режиме exclusive/shared и последующий перевод
* в операционный режим, функции osal_lck_seize() и osal_lck_downgrade().
* в операционный режим, функции lck_seize() и lck_downgrade().
* - Проверка присутствие процессов-читателей,
* т.е. функции osal_rpid_set(), osal_rpid_clear() и osal_rpid_check().
* т.е. функции lck_rpid_set(), lck_rpid_clear() и lck_rpid_check().
*
* Для блокировки файлов используется fcntl(F_SETLK), так как:
* - lockf() оперирует только эксклюзивной блокировкой и требует
@ -169,9 +69,9 @@ mdbx_global_destructor(void) {
static int op_setlk, op_setlkw, op_getlk;
__cold static void choice_fcntl(void) {
assert(!op_setlk && !op_setlkw && !op_getlk);
if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
if ((globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
#if defined(__linux__) || defined(__gnu_linux__)
&& linux_kernel_version >
&& globals.linux_kernel_version >
0x030f0000 /* OFD locks are available since 3.15, but engages here
only for 3.16 and later kernels (i.e. LTS) because
of reliability reasons */
@ -201,7 +101,6 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
"The bitness of system `off_t` type is mismatch. Please "
"fix build and/or NDK configuration.");
#endif /* Android */
jitter4testing(true);
assert(offset >= 0 && len > 0);
assert((uint64_t)offset < (uint64_t)INT64_MAX &&
(uint64_t)len < (uint64_t)INT64_MAX &&
@ -213,6 +112,8 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
assert((uint64_t)((off_t)((uint64_t)offset + (uint64_t)len)) ==
((uint64_t)offset + (uint64_t)len));
jitter4testing(true);
for (;;) {
MDBX_STRUCT_FLOCK lock_op;
STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) &&
@ -262,7 +163,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
}
}
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
#if MDBX_USE_OFDLOCKS
if (unlikely(op_setlk == 0))
choice_fcntl();
@ -270,30 +171,30 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX);
}
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0);
if (unlikely(osal_getpid() != env->me_pid))
MDBX_INTERNAL int lck_rpid_set(MDBX_env *env) {
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
assert(env->pid > 0);
if (unlikely(osal_getpid() != env->pid))
return MDBX_PANIC;
return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1);
return lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, env->pid, 1);
}
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0);
return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1);
MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env) {
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
assert(env->pid > 0);
return lck_op(env->lck_mmap.fd, op_setlk, F_UNLCK, env->pid, 1);
}
MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid) {
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
assert(pid > 0);
return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1);
return lck_op(env->lck_mmap.fd, op_getlk, F_WRLCK, pid, 1);
}
/*---------------------------------------------------------------------------*/
#if MDBX_LOCKING > MDBX_LOCKING_SYSV
MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc) {
MDBX_INTERNAL int lck_ipclock_stubinit(osal_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
return sem_init(ipc, false, 1) ? errno : 0;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \
@ -304,7 +205,7 @@ MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc) {
#endif
}
MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) {
MDBX_INTERNAL int lck_ipclock_destroy(osal_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
return sem_destroy(ipc) ? errno : 0;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \
@ -320,7 +221,7 @@ static int check_fstat(MDBX_env *env) {
struct stat st;
int rc = MDBX_SUCCESS;
if (fstat(env->me_lazy_fd, &st)) {
if (fstat(env->lazy_fd, &st)) {
rc = errno;
ERROR("fstat(%s), err %d", "DXB", rc);
return rc;
@ -345,7 +246,7 @@ static int check_fstat(MDBX_env *env) {
//----------------------------------------------------------------------------
if (fstat(env->me_lfd, &st)) {
if (fstat(env->lck_mmap.fd, &st)) {
rc = errno;
ERROR("fstat(%s), err %d", "LCK", rc);
return rc;
@ -363,8 +264,8 @@ static int check_fstat(MDBX_env *env) {
}
/* Checking file size for detect the situation when we got the shared lock
* immediately after osal_lck_destroy(). */
if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) {
* immediately after lck_destroy(). */
if (st.st_size < (off_t)(sizeof(lck_t) + sizeof(reader_slot_t))) {
VERBOSE("lck-file is too short (%u), exclusive-lock needed",
(unsigned)st.st_size);
rc = MDBX_RESULT_TRUE;
@ -373,18 +274,14 @@ static int check_fstat(MDBX_env *env) {
return rc;
}
__cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
if (unlikely(osal_getpid() != env->me_pid))
__cold MDBX_INTERNAL int lck_seize(MDBX_env *env) {
assert(env->lazy_fd != INVALID_HANDLE_VALUE);
if (unlikely(osal_getpid() != env->pid))
return MDBX_PANIC;
#if MDBX_USE_OFDLOCKS
if (unlikely(op_setlk == 0))
choice_fcntl();
#endif /* MDBX_USE_OFDLOCKS */
int rc = MDBX_SUCCESS;
#if defined(__linux__) || defined(__gnu_linux__)
if (unlikely(mdbx_RunningOnWSL1)) {
if (unlikely(globals.running_on_WSL1)) {
rc = ENOLCK /* No record locks available */;
ERROR("%s, err %u",
"WSL1 (Windows Subsystem for Linux) is mad and trouble-full, "
@ -394,11 +291,15 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
}
#endif /* Linux */
if (env->me_lfd == INVALID_HANDLE_VALUE) {
#if MDBX_USE_OFDLOCKS
if (unlikely(op_setlk == 0))
choice_fcntl();
#endif /* MDBX_USE_OFDLOCKS */
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
rc =
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
rc = lck_op(env->lazy_fd, op_setlk,
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
if (rc != MDBX_SUCCESS) {
ERROR("%s, err %u", "without-lck", rc);
eASSERT(env, MDBX_IS_ERROR(rc));
@ -412,7 +313,7 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
retry:
if (rc == MDBX_RESULT_TRUE) {
rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1);
rc = lck_op(env->lck_mmap.fd, op_setlk, F_UNLCK, 0, 1);
if (rc != MDBX_SUCCESS) {
ERROR("%s, err %u", "unlock-before-retry", rc);
eASSERT(env, MDBX_IS_ERROR(rc));
@ -421,16 +322,15 @@ retry:
}
/* Firstly try to get exclusive locking. */
rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
rc = lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, 1);
if (rc == MDBX_SUCCESS) {
rc = check_fstat(env);
if (MDBX_IS_ERROR(rc))
return rc;
continue_dxb_exclusive:
rc =
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
rc = lck_op(env->lazy_fd, op_setlk,
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
if (rc == MDBX_SUCCESS)
return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
@ -455,16 +355,16 @@ retry:
}
/* Here could be one of two:
* - osal_lck_destroy() from the another process was hold the lock
* - lck_destroy() from the another process was hold the lock
* during a destruction.
* - either osal_lck_seize() from the another process was got the exclusive
* - either lck_seize() from the another process was got the exclusive
* lock and doing initialization.
* For distinguish these cases will use size of the lck-file later. */
/* Wait for lck-shared now. */
/* Here may be await during transient processes, for instance until another
* competing process doesn't call lck_downgrade(). */
rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1);
rc = lck_op(env->lck_mmap.fd, op_setlkw, F_RDLCK, 0, 1);
if (rc != MDBX_SUCCESS) {
ERROR("%s, err %u", "try-shared", rc);
eASSERT(env, MDBX_IS_ERROR(rc));
@ -480,7 +380,7 @@ retry:
}
/* got shared, retry exclusive */
rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
rc = lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, 1);
if (rc == MDBX_SUCCESS)
goto continue_dxb_exclusive;
@ -492,9 +392,8 @@ retry:
}
/* Lock against another process operating in without-lck or exclusive mode. */
rc =
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
rc = lck_op(env->lazy_fd, op_setlk,
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->pid, 1);
if (rc != MDBX_SUCCESS) {
ERROR("%s, err %u", "lock-against-without-lck", rc);
eASSERT(env, MDBX_IS_ERROR(rc));
@ -505,20 +404,20 @@ retry:
return MDBX_RESULT_FALSE;
}
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
if (unlikely(osal_getpid() != env->me_pid))
MDBX_INTERNAL int lck_downgrade(MDBX_env *env) {
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
if (unlikely(osal_getpid() != env->pid))
return MDBX_PANIC;
int rc = MDBX_SUCCESS;
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid);
if ((env->flags & MDBX_EXCLUSIVE) == 0) {
rc = lck_op(env->lazy_fd, op_setlk, F_UNLCK, 0, env->pid);
if (rc == MDBX_SUCCESS)
rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1,
OFF_T_MAX - env->me_pid - 1);
rc = lck_op(env->lazy_fd, op_setlk, F_UNLCK, env->pid + 1,
OFF_T_MAX - env->pid - 1);
}
if (rc == MDBX_SUCCESS)
rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
rc = lck_op(env->lck_mmap.fd, op_setlk, F_RDLCK, 0, 1);
if (unlikely(rc != 0)) {
ERROR("%s, err %u", "lck", rc);
assert(MDBX_IS_ERROR(rc));
@ -526,25 +425,24 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
return rc;
}
MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
if (unlikely(osal_getpid() != env->me_pid))
MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait) {
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
if (unlikely(osal_getpid() != env->pid))
return MDBX_PANIC;
const int cmd = dont_wait ? op_setlk : op_setlkw;
int rc = lck_op(env->me_lfd, cmd, F_WRLCK, 0, 1);
if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_EXCLUSIVE) == 0) {
rc = (env->me_pid > 1)
? lck_op(env->me_lazy_fd, cmd, F_WRLCK, 0, env->me_pid - 1)
: MDBX_SUCCESS;
int rc = lck_op(env->lck_mmap.fd, cmd, F_WRLCK, 0, 1);
if (rc == MDBX_SUCCESS && (env->flags & MDBX_EXCLUSIVE) == 0) {
rc = (env->pid > 1) ? lck_op(env->lazy_fd, cmd, F_WRLCK, 0, env->pid - 1)
: MDBX_SUCCESS;
if (rc == MDBX_SUCCESS) {
rc = lck_op(env->me_lazy_fd, cmd, F_WRLCK, env->me_pid + 1,
OFF_T_MAX - env->me_pid - 1);
if (rc != MDBX_SUCCESS && env->me_pid > 1 &&
lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid - 1))
rc = lck_op(env->lazy_fd, cmd, F_WRLCK, env->pid + 1,
OFF_T_MAX - env->pid - 1);
if (rc != MDBX_SUCCESS && env->pid > 1 &&
lck_op(env->lazy_fd, op_setlk, F_UNLCK, 0, env->pid - 1))
rc = MDBX_PANIC;
}
if (rc != MDBX_SUCCESS && lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1))
if (rc != MDBX_SUCCESS && lck_op(env->lck_mmap.fd, op_setlk, F_RDLCK, 0, 1))
rc = MDBX_PANIC;
}
if (unlikely(rc != 0)) {
@ -554,48 +452,48 @@ MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
return rc;
}
__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
MDBX_env *inprocess_neighbor,
const uint32_t current_pid) {
__cold MDBX_INTERNAL int lck_destroy(MDBX_env *env,
MDBX_env *inprocess_neighbor,
const uint32_t current_pid) {
eASSERT(env, osal_getpid() == current_pid);
int rc = MDBX_SUCCESS;
struct stat lck_info;
MDBX_lockinfo *lck = env->me_lck;
if (lck && lck == env->me_lck_mmap.lck && !inprocess_neighbor &&
lck_t *lck = env->lck;
if (lck && lck == env->lck_mmap.lck && !inprocess_neighbor &&
/* try get exclusive access */
lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
/* if LCK was not removed */
fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
fstat(env->lck_mmap.fd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
lck_op(env->lazy_fd, op_setlk,
(env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
OFF_T_MAX) == 0) {
VERBOSE("%p got exclusive, drown ipc-locks", (void *)env);
eASSERT(env, current_pid == env->me_pid);
eASSERT(env, current_pid == env->pid);
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
if (env->me_sysv_ipc.semid != -1)
rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0;
#else
rc = osal_ipclock_destroy(&lck->mti_rlock);
rc = lck_ipclock_destroy(&lck->rdt_lock);
if (rc == 0)
rc = osal_ipclock_destroy(&lck->mti_wlock);
rc = lck_ipclock_destroy(&lck->wrt_lock);
#endif /* MDBX_LOCKING */
eASSERT(env, rc == 0);
if (rc == 0) {
const bool synced = lck->mti_unsynced_pages.weak == 0;
osal_munmap(&env->me_lck_mmap);
if (synced && env->me_lfd != INVALID_HANDLE_VALUE)
rc = ftruncate(env->me_lfd, 0) ? errno : 0;
const bool synced = lck->unsynced_pages.weak == 0;
osal_munmap(&env->lck_mmap);
if (synced && env->lck_mmap.fd != INVALID_HANDLE_VALUE)
rc = ftruncate(env->lck_mmap.fd, 0) ? errno : 0;
}
jitter4testing(false);
}
if (current_pid != env->me_pid) {
if (current_pid != env->pid) {
eASSERT(env, !inprocess_neighbor);
NOTICE("drown env %p after-fork pid %d -> %d",
__Wpedantic_format_voidptr(env), env->me_pid, current_pid);
__Wpedantic_format_voidptr(env), env->pid, current_pid);
inprocess_neighbor = nullptr;
}
@ -607,57 +505,55 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
* locks should be released here explicitly with properly order. */
/* close dxb and restore lock */
if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS)
if (env->dsync_fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->dsync_fd) != 0) && rc == MDBX_SUCCESS)
rc = errno;
env->me_dsync_fd = INVALID_HANDLE_VALUE;
env->dsync_fd = INVALID_HANDLE_VALUE;
}
if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS)
if (env->lazy_fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->lazy_fd) != 0) && rc == MDBX_SUCCESS)
rc = errno;
env->me_lazy_fd = INVALID_HANDLE_VALUE;
env->lazy_fd = INVALID_HANDLE_VALUE;
if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
/* restore file-lock */
rc = lck_op(
inprocess_neighbor->me_lazy_fd, F_SETLKW,
(inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
(inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)
? 0
: inprocess_neighbor->me_pid,
(inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
rc = lck_op(inprocess_neighbor->lazy_fd, F_SETLKW,
(inprocess_neighbor->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
(inprocess_neighbor->flags & MDBX_EXCLUSIVE)
? 0
: inprocess_neighbor->pid,
(inprocess_neighbor->flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
}
}
/* close clk and restore locks */
if (env->me_lfd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS)
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->lck_mmap.fd) != 0) && rc == MDBX_SUCCESS)
rc = errno;
env->me_lfd = INVALID_HANDLE_VALUE;
env->lck_mmap.fd = INVALID_HANDLE_VALUE;
if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
/* restore file-locks */
rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1);
if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader)
rc = osal_rpid_set(inprocess_neighbor);
rc = lck_op(inprocess_neighbor->lck_mmap.fd, F_SETLKW, F_RDLCK, 0, 1);
if (rc == MDBX_SUCCESS && inprocess_neighbor->registered_reader_pid)
rc = lck_rpid_set(inprocess_neighbor);
}
}
if (inprocess_neighbor && rc != MDBX_SUCCESS)
inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
inprocess_neighbor->flags |= ENV_FATAL_ERROR;
return rc;
}
/*---------------------------------------------------------------------------*/
__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
MDBX_env *inprocess_neighbor,
int global_uniqueness_flag) {
__cold MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor,
int global_uniqueness_flag) {
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
int semid = -1;
/* don't initialize semaphores twice */
(void)inprocess_neighbor;
if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
struct stat st;
if (fstat(env->me_lazy_fd, &st))
if (fstat(env->lazy_fd, &st))
return errno;
sysv_retry_create:
semid = semget(env->me_sysv_ipc.key, 2,
@ -711,9 +607,9 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
/* don't initialize semaphores twice */
(void)inprocess_neighbor;
if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
if (sem_init(&env->me_lck_mmap.lck->mti_rlock, true, 1))
if (sem_init(&env->lck_mmap.lck->rdt_lock, true, 1))
return errno;
if (sem_init(&env->me_lck_mmap.lck->mti_wlock, true, 1))
if (sem_init(&env->lck_mmap.lck->wrt_lock, true, 1))
return errno;
}
return MDBX_SUCCESS;
@ -782,10 +678,10 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
if (rc && rc != ENOTSUP)
goto bailout;
rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_rlock, &ma);
rc = pthread_mutex_init(&env->lck_mmap.lck->rdt_lock, &ma);
if (rc)
goto bailout;
rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_wlock, &ma);
rc = pthread_mutex_init(&env->lck_mmap.lck->wrt_lock, &ma);
bailout:
pthread_mutexattr_destroy(&ma);
@ -799,23 +695,27 @@ __cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc,
const int err) {
int rc = err;
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV
#ifndef EOWNERDEAD
#define EOWNERDEAD MDBX_RESULT_TRUE
#endif /* EOWNERDEAD */
if (err == EOWNERDEAD) {
/* We own the mutex. Clean up after dead previous owner. */
const bool rlocked = ipc == &env->me_lck->mti_rlock;
const bool rlocked = ipc == &env->lck->rdt_lock;
rc = MDBX_SUCCESS;
if (!rlocked) {
if (unlikely(env->me_txn)) {
if (unlikely(env->txn)) {
/* env is hosed if the dead thread was ours */
env->me_flags |= MDBX_FATAL_ERROR;
env->me_txn = NULL;
env->flags |= ENV_FATAL_ERROR;
env->txn = nullptr;
rc = MDBX_PANIC;
}
}
WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'),
(rc ? "this process' env is hosed" : "recovering"));
int check_rc = cleanup_dead_readers(env, rlocked, NULL);
int check_rc = mvcc_cleanup_dead(env, rlocked, nullptr);
check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
@ -858,12 +758,12 @@ __cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc,
ERROR("mutex (un)lock failed, %s", mdbx_strerror(err));
if (rc != EDEADLK)
env->me_flags |= MDBX_FATAL_ERROR;
env->flags |= ENV_FATAL_ERROR;
return rc;
}
#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC)
MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) {
MDBX_INTERNAL int osal_check_tid4bionic(void) {
/* avoid 32-bit Bionic bug/hang with 32-pit TID */
if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) {
pid_t tid = gettid();
@ -900,7 +800,7 @@ static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc,
} else if (sem_wait(ipc))
rc = errno;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
struct sembuf op = {.sem_num = (ipc != &env->lck->wrt_lock),
.sem_op = -1,
.sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO};
int rc;
@ -910,7 +810,7 @@ static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc,
rc = MDBX_BUSY;
} else {
rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS;
*ipc = env->me_pid;
*ipc = env->pid;
}
#else
#error "FIXME"
@ -929,11 +829,11 @@ int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) {
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
err = sem_post(ipc) ? errno : MDBX_SUCCESS;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
if (unlikely(*ipc != (pid_t)env->me_pid))
if (unlikely(*ipc != (pid_t)env->pid))
err = EPERM;
else {
*ipc = 0;
struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
struct sembuf op = {.sem_num = (ipc != &env->lck->wrt_lock),
.sem_op = 1,
.sem_flg = SEM_UNDO};
err = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS;
@ -944,66 +844,61 @@ int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) {
int rc = err;
if (unlikely(rc != MDBX_SUCCESS)) {
const uint32_t current_pid = osal_getpid();
if (current_pid == env->me_pid || LOG_ENABLED(MDBX_LOG_NOTICE))
debug_log((current_pid == env->me_pid)
if (current_pid == env->pid || LOG_ENABLED(MDBX_LOG_NOTICE))
debug_log((current_pid == env->pid)
? MDBX_LOG_FATAL
: (rc = MDBX_SUCCESS, MDBX_LOG_NOTICE),
"ipc-unlock()", __LINE__, "failed: env %p, lck-%s %p, err %d\n",
__Wpedantic_format_voidptr(env),
(env->me_lck == env->me_lck_mmap.lck) ? "mmap" : "stub",
__Wpedantic_format_voidptr(env->me_lck), err);
(env->lck == env->lck_mmap.lck) ? "mmap" : "stub",
__Wpedantic_format_voidptr(env->lck), err);
}
return rc;
}
MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env) {
TRACE("%s", ">>");
jitter4testing(true);
int rc = osal_ipclock_lock(env, &env->me_lck->mti_rlock, false);
int rc = osal_ipclock_lock(env, &env->lck->rdt_lock, false);
TRACE("<< rc %d", rc);
return rc;
}
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env) {
TRACE("%s", ">>");
int err = osal_ipclock_unlock(env, &env->me_lck->mti_rlock);
int err = osal_ipclock_unlock(env, &env->lck->rdt_lock);
TRACE("<< err %d", err);
if (unlikely(err != MDBX_SUCCESS))
mdbx_panic("%s() failed: err %d\n", __func__, err);
jitter4testing(true);
}
int osal_txn_lock(MDBX_env *env, bool dont_wait) {
int lck_txn_lock(MDBX_env *env, bool dont_wait) {
TRACE("%swait %s", dont_wait ? "dont-" : "", ">>");
jitter4testing(true);
const int err = osal_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait);
const int err = osal_ipclock_lock(env, &env->lck->wrt_lock, dont_wait);
int rc = err;
if (likely(!MDBX_IS_ERROR(err))) {
eASSERT(env, !env->me_txn0->mt_owner ||
eASSERT(env, !env->basal_txn->owner ||
err == /* если другой поток в этом-же процессе завершился
не освободив блокировку */
MDBX_RESULT_TRUE);
env->me_txn0->mt_owner = osal_thread_self();
env->basal_txn->owner = osal_thread_self();
rc = MDBX_SUCCESS;
}
TRACE("<< err %d, rc %d", err, rc);
return rc;
}
void osal_txn_unlock(MDBX_env *env) {
void lck_txn_unlock(MDBX_env *env) {
TRACE("%s", ">>");
eASSERT(env, env->me_txn0->mt_owner == osal_thread_self());
env->me_txn0->mt_owner = 0;
int err = osal_ipclock_unlock(env, &env->me_lck->mti_wlock);
eASSERT(env, env->basal_txn->owner == osal_thread_self());
env->basal_txn->owner = 0;
int err = osal_ipclock_unlock(env, &env->lck->wrt_lock);
TRACE("<< err %d", err);
if (unlikely(err != MDBX_SUCCESS))
mdbx_panic("%s() failed: err %d\n", __func__, err);
jitter4testing(true);
}
#else
#ifdef _MSC_VER
#pragma warning(disable : 4206) /* nonstandard extension used: translation \
unit is empty */
#endif /* _MSC_VER (warnings) */
#endif /* !Windows LCK-implementation */
#endif /* !Windows LCK-implementation */

View File

@ -1,18 +1,7 @@
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#if defined(_WIN32) || defined(_WIN64) /* Windows LCK-implementation */
#if defined(_WIN32) || defined(_WIN64)
/* PREAMBLE FOR WINDOWS:
*
@ -22,91 +11,6 @@
#include "internals.h"
static void mdbx_winnt_import(void);
#if MDBX_BUILD_SHARED_LIBRARY
#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
*
* Define dll's entry point only for Release build when NDEBUG is defined and
* MDBX_WITHOUT_MSVC_CRT=ON. if the entry point isn't defined then MSVC's will
* automatically use DllMainCRTStartup() from CRT library, which also
* automatically call DllMain() from our mdbx.dll */
#pragma comment(linker, "/ENTRY:DllMain")
#endif /* MDBX_WITHOUT_MSVC_CRT */
BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
#else
#if !MDBX_MANUAL_MODULE_HANDLER
static
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
void NTAPI
mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
#endif /* MDBX_BUILD_SHARED_LIBRARY */
{
(void)reserved;
switch (reason) {
case DLL_PROCESS_ATTACH:
mdbx_winnt_import();
global_ctor();
break;
case DLL_PROCESS_DETACH:
global_dtor();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
thread_dtor(module);
break;
}
#if MDBX_BUILD_SHARED_LIBRARY
return TRUE;
#endif
}
#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
/* *INDENT-OFF* */
/* clang-format off */
#if defined(_MSC_VER)
# pragma const_seg(push)
# pragma data_seg(push)
# ifndef _M_IX86
/* kick a linker to create the TLS directory if not already done */
# pragma comment(linker, "/INCLUDE:_tls_used")
/* Force some symbol references. */
# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
/* specific const-segment for WIN64 */
# pragma const_seg(".CRT$XLB")
const
# else
/* kick a linker to create the TLS directory if not already done */
# pragma comment(linker, "/INCLUDE:__tls_used")
/* Force some symbol references. */
# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
/* specific data-segment for WIN32 */
# pragma data_seg(".CRT$XLB")
# endif
__declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
# pragma data_seg(pop)
# pragma const_seg(pop)
#elif defined(__GNUC__)
# ifndef _M_IX86
const
# endif
PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
#else
# error FIXME
#endif
/* *INDENT-ON* */
/* clang-format on */
#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
/*----------------------------------------------------------------------------*/
#define LCK_SHARED 0
#define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK
#define LCK_WAITFOR 0
@ -145,17 +49,16 @@ static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags,
return (int)rc;
}
static __inline int flock(HANDLE fd, unsigned flags, size_t offset,
size_t bytes) {
static inline int flock(HANDLE fd, unsigned flags, size_t offset,
size_t bytes) {
return flock_with_event(fd, 0, flags, offset, bytes);
}
static __inline int flock_data(const MDBX_env *env, unsigned flags,
size_t offset, size_t bytes) {
static inline int flock_data(const MDBX_env *env, unsigned flags, size_t offset,
size_t bytes) {
const HANDLE fd4data =
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
return flock_with_event(fd4data, env->me_data_lock_event, flags, offset,
bytes);
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
return flock_with_event(fd4data, env->dxb_lock_event, flags, offset, bytes);
}
static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) {
@ -175,16 +78,16 @@ static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) {
#else
#define DXB_MAXLEN UINT32_C(0x7ff00000)
#endif
#define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN
#define DXB_BODY (env->ps * (size_t)NUM_METAS), DXB_MAXLEN
#define DXB_WHOLE 0, DXB_MAXLEN
int osal_txn_lock(MDBX_env *env, bool dontwait) {
int lck_txn_lock(MDBX_env *env, bool dontwait) {
if (dontwait) {
if (!TryEnterCriticalSection(&env->me_windowsbug_lock))
if (!TryEnterCriticalSection(&env->windowsbug_lock))
return MDBX_BUSY;
} else {
__try {
EnterCriticalSection(&env->me_windowsbug_lock);
EnterCriticalSection(&env->windowsbug_lock);
}
__except ((GetExceptionCode() ==
0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
@ -194,93 +97,93 @@ int osal_txn_lock(MDBX_env *env, bool dontwait) {
}
}
eASSERT(env, !env->me_txn0->mt_owner);
if (env->me_flags & MDBX_EXCLUSIVE)
eASSERT(env, !env->basal_txn->owner);
if (env->flags & MDBX_EXCLUSIVE)
goto done;
const HANDLE fd4data =
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
int rc = flock_with_event(fd4data, env->me_data_lock_event,
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
int rc = flock_with_event(fd4data, env->dxb_lock_event,
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
: (LCK_EXCLUSIVE | LCK_WAITFOR),
DXB_BODY);
if (rc == ERROR_LOCK_VIOLATION && dontwait) {
SleepEx(0, true);
rc = flock_with_event(fd4data, env->me_data_lock_event,
rc = flock_with_event(fd4data, env->dxb_lock_event,
LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY);
if (rc == ERROR_LOCK_VIOLATION) {
SleepEx(0, true);
rc = flock_with_event(fd4data, env->me_data_lock_event,
rc = flock_with_event(fd4data, env->dxb_lock_event,
LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY);
}
}
if (rc == MDBX_SUCCESS) {
done:
/* Zap: Failing to release lock 'env->me_windowsbug_lock'
/* Zap: Failing to release lock 'env->windowsbug_lock'
* in function 'mdbx_txn_lock' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115);
env->me_txn0->mt_owner = osal_thread_self();
env->basal_txn->owner = osal_thread_self();
return MDBX_SUCCESS;
}
LeaveCriticalSection(&env->me_windowsbug_lock);
LeaveCriticalSection(&env->windowsbug_lock);
return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
}
void osal_txn_unlock(MDBX_env *env) {
eASSERT(env, env->me_txn0->mt_owner == osal_thread_self());
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
void lck_txn_unlock(MDBX_env *env) {
eASSERT(env, env->basal_txn->owner == osal_thread_self());
if ((env->flags & MDBX_EXCLUSIVE) == 0) {
const HANDLE fd4data =
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
int err = funlock(fd4data, DXB_BODY);
if (err != MDBX_SUCCESS)
mdbx_panic("%s failed: err %u", __func__, err);
}
env->me_txn0->mt_owner = 0;
LeaveCriticalSection(&env->me_windowsbug_lock);
env->basal_txn->owner = 0;
LeaveCriticalSection(&env->windowsbug_lock);
}
/*----------------------------------------------------------------------------*/
/* global `read` lock for readers registration,
* exclusive locking `mti_numreaders` (second) cacheline */
* exclusive locking `rdt_length` (second) cacheline */
#define LCK_LO_OFFSET 0
#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders)
#define LCK_LO_LEN offsetof(lck_t, rdt_length)
#define LCK_UP_OFFSET LCK_LO_LEN
#define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET)
#define LCK_UP_LEN (sizeof(lck_t) - LCK_UP_OFFSET)
#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN
#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
osal_srwlock_AcquireShared(&env->me_remap_guard);
if (env->me_lfd == INVALID_HANDLE_VALUE)
MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env) {
imports.srwl_AcquireShared(&env->remap_guard);
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE)
return MDBX_SUCCESS; /* readonly database in readonly filesystem */
/* transition from S-? (used) to S-E (locked),
* e.g. exclusive lock upper-part */
if (env->me_flags & MDBX_EXCLUSIVE)
if (env->flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS;
int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
int rc = flock(env->lck_mmap.fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
if (rc == MDBX_SUCCESS)
return MDBX_SUCCESS;
osal_srwlock_ReleaseShared(&env->me_remap_guard);
imports.srwl_ReleaseShared(&env->remap_guard);
return rc;
}
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
if (env->me_lfd != INVALID_HANDLE_VALUE &&
(env->me_flags & MDBX_EXCLUSIVE) == 0) {
MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env) {
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE &&
(env->flags & MDBX_EXCLUSIVE) == 0) {
/* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */
int err = funlock(env->me_lfd, LCK_UPPER);
int err = funlock(env->lck_mmap.fd, LCK_UPPER);
if (err != MDBX_SUCCESS)
mdbx_panic("%s failed: err %u", __func__, err);
}
osal_srwlock_ReleaseShared(&env->me_remap_guard);
imports.srwl_ReleaseShared(&env->remap_guard);
}
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
return flock(
fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0,
DXB_MAXLEN);
@ -293,7 +196,7 @@ static int suspend_and_append(mdbx_handle_array_t **array,
mdbx_handle_array_t *const ptr =
osal_realloc((limit > ARRAY_LENGTH((*array)->handles))
? *array
: /* don't free initial array on the stack */ NULL,
: /* don't free initial array on the stack */ nullptr,
sizeof(mdbx_handle_array_t) +
sizeof(HANDLE) * (limit * (size_t)2 -
ARRAY_LENGTH((*array)->handles)));
@ -307,7 +210,7 @@ static int suspend_and_append(mdbx_handle_array_t **array,
HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION,
FALSE, ThreadId);
if (hThread == NULL)
if (hThread == nullptr)
return (int)GetLastError();
if (SuspendThread(hThread) == (DWORD)-1) {
@ -324,28 +227,27 @@ static int suspend_and_append(mdbx_handle_array_t **array,
return MDBX_SUCCESS;
}
MDBX_INTERNAL_FUNC int
MDBX_INTERNAL int
osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
eASSERT(env, (env->me_flags & MDBX_NOSTICKYTHREADS) == 0);
eASSERT(env, (env->flags & MDBX_NOSTICKYTHREADS) == 0);
const uintptr_t CurrentTid = GetCurrentThreadId();
int rc;
if (env->me_lck_mmap.lck) {
if (env->lck_mmap.lck) {
/* Scan LCK for threads of the current process */
const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers;
const MDBX_reader *const end =
const reader_slot_t *const begin = env->lck_mmap.lck->rdt;
const reader_slot_t *const end =
begin +
atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, mo_AcquireRelease);
const uintptr_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0;
for (const MDBX_reader *reader = begin; reader < end; ++reader) {
if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) {
atomic_load32(&env->lck_mmap.lck->rdt_length, mo_AcquireRelease);
const uintptr_t WriteTxnOwner = env->basal_txn ? env->basal_txn->owner : 0;
for (const reader_slot_t *reader = begin; reader < end; ++reader) {
if (reader->pid.weak != env->pid || !reader->tid.weak) {
skip_lck:
continue;
}
if (reader->mr_tid.weak == CurrentTid ||
reader->mr_tid.weak == WriteTxnOwner)
if (reader->tid.weak == CurrentTid || reader->tid.weak == WriteTxnOwner)
goto skip_lck;
rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak);
rc = suspend_and_append(array, (mdbx_tid_t)reader->tid.weak);
if (rc != MDBX_SUCCESS) {
bailout_lck:
(void)osal_resume_threads_after_remap(*array);
@ -360,7 +262,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
} else {
/* Without LCK (i.e. read-only mode).
* Walk through a snapshot of all running threads */
eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY));
eASSERT(env, env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY));
const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
if (hSnapshot == INVALID_HANDLE_VALUE)
return (int)GetLastError();
@ -377,7 +279,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
}
do {
if (entry.th32OwnerProcessID != env->me_pid ||
if (entry.th32OwnerProcessID != env->pid ||
entry.th32ThreadID == CurrentTid)
continue;
@ -396,8 +298,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
return MDBX_SUCCESS;
}
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
MDBX_INTERNAL int osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
int rc = MDBX_SUCCESS;
for (unsigned i = 0; i < array->count; ++i) {
const HANDLE hThread = array->handles[i];
@ -426,6 +327,7 @@ osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
* Only 6 states of FSM are used, which 2 of ones are transitive.
*
* States:
* LO HI
* ?-? = free, i.e. unlocked
* S-? = used, i.e. shared lock
* E-? = exclusive-read, i.e. operational exclusive
@ -436,39 +338,39 @@ osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
* E-S
* E-E = exclusive-write, i.e. exclusive due (re)initialization
*
* The osal_lck_seize() moves the locking-FSM from the initial free/unlocked
* The lck_seize() moves the locking-FSM from the initial free/unlocked
* state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible,
* or to the "used" (and returns MDBX_RESULT_FALSE).
*
* The osal_lck_downgrade() moves the locking-FSM from "exclusive write"
* The lck_downgrade() moves the locking-FSM from "exclusive write"
* state to the "used" (i.e. shared) state.
*
* The osal_lck_upgrade() moves the locking-FSM from "used" (i.e. shared)
* The lck_upgrade() moves the locking-FSM from "used" (i.e. shared)
* state to the "exclusive write" state.
*/
static void lck_unlock(MDBX_env *env) {
int err;
if (env->me_lfd != INVALID_HANDLE_VALUE) {
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
/* double `unlock` for robustly remove overlapped shared/exclusive locks */
do
err = funlock(env->me_lfd, LCK_LOWER);
err = funlock(env->lck_mmap.fd, LCK_LOWER);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
SetLastError(ERROR_SUCCESS);
do
err = funlock(env->me_lfd, LCK_UPPER);
err = funlock(env->lck_mmap.fd, LCK_UPPER);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
SetLastError(ERROR_SUCCESS);
}
const HANDLE fd4data =
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
if (fd4data != INVALID_HANDLE_VALUE) {
/* explicitly unlock to avoid latency for other processes (windows kernel
* releases such locks via deferred queues) */
@ -476,14 +378,14 @@ static void lck_unlock(MDBX_env *env) {
err = funlock(fd4data, DXB_BODY);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
SetLastError(ERROR_SUCCESS);
do
err = funlock(fd4data, DXB_WHOLE);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(globals.running_under_Wine && err == ERROR_LOCK_VIOLATION));
SetLastError(ERROR_SUCCESS);
}
}
@ -539,16 +441,16 @@ static int internal_seize_lck(HANDLE lfd) {
return rc;
}
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
MDBX_INTERNAL int lck_seize(MDBX_env *env) {
const HANDLE fd4data =
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
assert(fd4data != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
if (env->flags & MDBX_EXCLUSIVE)
return MDBX_RESULT_TRUE /* nope since files were must be opened
non-shareable */
;
if (env->me_lfd == INVALID_HANDLE_VALUE) {
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. on read-only filesystem) */
jitter4testing(false);
int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE);
@ -557,9 +459,9 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
return rc;
}
int rc = internal_seize_lck(env->me_lfd);
int rc = internal_seize_lck(env->lck_mmap.fd);
jitter4testing(false);
if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
if (rc == MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) {
/* Check that another process don't operates in without-lck mode.
* Doing such check by exclusive locking the body-part of db. Should be
* noted:
@ -583,24 +485,24 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
return rc;
}
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
MDBX_INTERNAL int lck_downgrade(MDBX_env *env) {
const HANDLE fd4data =
env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
/* Transite from exclusive-write state (E-E) to used (S-?) */
assert(fd4data != INVALID_HANDLE_VALUE);
assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
if (env->flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
;
/* 1) now at E-E (exclusive-write), transition to ?_E (middle) */
int rc = funlock(env->me_lfd, LCK_LOWER);
int rc = funlock(env->lck_mmap.fd, LCK_LOWER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__,
"E-E(exclusive-write) >> ?-E(middle)", rc);
/* 2) now at ?-E (middle), transition to S-E (locked) */
rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
rc = flock(env->lck_mmap.fd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
if (rc != MDBX_SUCCESS) {
/* 3) something went wrong, give up */;
ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
@ -608,7 +510,7 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
}
/* 4) got S-E (locked), continue transition to S-? (used) */
rc = funlock(env->me_lfd, LCK_UPPER);
rc = funlock(env->lck_mmap.fd, LCK_UPPER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
rc);
@ -616,17 +518,17 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
return MDBX_SUCCESS /* 5) now at S-? (used), done */;
}
MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait) {
/* Transite from used state (S-?) to exclusive-write (E-E) */
assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
if (env->flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
;
/* 1) now on S-? (used), try S-E (locked) */
jitter4testing(false);
int rc = flock(env->me_lfd,
int rc = flock(env->lck_mmap.fd,
dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE,
LCK_UPPER);
if (rc != MDBX_SUCCESS) {
@ -636,14 +538,14 @@ MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
}
/* 3) now on S-E (locked), transition to ?-E (middle) */
rc = funlock(env->me_lfd, LCK_LOWER);
rc = funlock(env->lck_mmap.fd, LCK_LOWER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)",
rc);
/* 4) now on ?-E (middle), try E-E (exclusive-write) */
jitter4testing(false);
rc = flock(env->me_lfd,
rc = flock(env->lck_mmap.fd,
dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE,
LCK_LOWER);
if (rc != MDBX_SUCCESS) {
@ -655,25 +557,24 @@ MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) {
return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */;
}
MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
MDBX_env *inprocess_neighbor,
int global_uniqueness_flag) {
MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor,
int global_uniqueness_flag) {
(void)env;
(void)inprocess_neighbor;
(void)global_uniqueness_flag;
if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) {
if (imports.SetFileIoOverlappedRange && !(env->flags & MDBX_RDONLY)) {
HANDLE token = INVALID_HANDLE_VALUE;
TOKEN_PRIVILEGES privileges;
privileges.PrivilegeCount = 1;
privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES,
&token) ||
!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME,
!LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME,
&privileges.Privileges[0].Luid) ||
!AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges),
nullptr, nullptr) ||
GetLastError() != ERROR_SUCCESS)
mdbx_SetFileIoOverlappedRange = NULL;
imports.SetFileIoOverlappedRange = nullptr;
if (token != INVALID_HANDLE_VALUE)
CloseHandle(token);
@ -681,21 +582,21 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
return MDBX_SUCCESS;
}
MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
MDBX_env *inprocess_neighbor,
const uint32_t current_pid) {
MDBX_INTERNAL int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor,
const uint32_t current_pid) {
(void)current_pid;
/* LY: should unmap before releasing the locks to avoid race condition and
* STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */
if (env->me_map)
osal_munmap(&env->me_dxb_mmap);
if (env->me_lck_mmap.lck) {
const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0;
osal_munmap(&env->me_lck_mmap);
if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE &&
osal_lck_upgrade(env, true) == MDBX_SUCCESS)
if (env->dxb_mmap.base)
osal_munmap(&env->dxb_mmap);
if (env->lck_mmap.lck) {
const bool synced = env->lck_mmap.lck->unsynced_pages.weak == 0;
osal_munmap(&env->lck_mmap);
if (synced && !inprocess_neighbor &&
env->lck_mmap.fd != INVALID_HANDLE_VALUE &&
lck_upgrade(env, true) == MDBX_SUCCESS)
/* this will fail if LCK is used/mmapped by other process(es) */
osal_ftruncate(env->me_lfd, 0);
osal_ftruncate(env->lck_mmap.fd, 0);
}
lck_unlock(env);
return MDBX_SUCCESS;
@ -704,12 +605,12 @@ MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
/*----------------------------------------------------------------------------*/
/* reader checking (by pid) */
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) {
MDBX_INTERNAL int lck_rpid_set(MDBX_env *env) {
(void)env;
return MDBX_SUCCESS;
}
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env) {
(void)env;
return MDBX_SUCCESS;
}
@ -720,7 +621,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
* MDBX_RESULT_TRUE, if pid is live (unable to acquire lock)
* MDBX_RESULT_FALSE, if pid is dead (lock acquired)
* or otherwise the errcode. */
MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid) {
(void)env;
HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid);
int rc;
@ -753,169 +654,4 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
}
}
//----------------------------------------------------------------------------
// Stub for slim read-write lock
// Copyright (C) 1995-2002 Brad Wilson
static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) {
srwl->readerCount = srwl->writerCount = 0;
}
static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) {
while (true) {
assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
// If there's a writer already, spin without unnecessarily
// interlocking the CPUs
if (srwl->writerCount != 0) {
SwitchToThread();
continue;
}
// Add to the readers list
_InterlockedIncrement(&srwl->readerCount);
// Check for writers again (we may have been preempted). If
// there are no writers writing or waiting, then we're done.
if (srwl->writerCount == 0)
break;
// Remove from the readers list, spin, try again
_InterlockedDecrement(&srwl->readerCount);
SwitchToThread();
}
}
static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) {
assert(srwl->readerCount > 0);
_InterlockedDecrement(&srwl->readerCount);
}
static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) {
while (true) {
assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
// If there's a writer already, spin without unnecessarily
// interlocking the CPUs
if (srwl->writerCount != 0) {
SwitchToThread();
continue;
}
// See if we can become the writer (expensive, because it inter-
// locks the CPUs, so writing should be an infrequent process)
if (_InterlockedExchange(&srwl->writerCount, 1) == 0)
break;
}
// Now we're the writer, but there may be outstanding readers.
// Spin until there aren't any more; new readers will wait now
// that we're the writer.
while (srwl->readerCount != 0) {
assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
SwitchToThread();
}
}
static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) {
assert(srwl->writerCount == 1 && srwl->readerCount >= 0);
srwl->writerCount = 0;
}
static uint64_t WINAPI stub_GetTickCount64(void) {
LARGE_INTEGER Counter, Frequency;
return (QueryPerformanceFrequency(&Frequency) &&
QueryPerformanceCounter(&Counter))
? Counter.QuadPart * 1000ul / Frequency.QuadPart
: 0;
}
/*----------------------------------------------------------------------------*/
#ifndef xMDBX_ALLOY
osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared,
osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive,
osal_srwlock_ReleaseExclusive;
MDBX_NtExtendSection mdbx_NtExtendSection;
MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
MDBX_NtFsControlFile mdbx_NtFsControlFile;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_GetTickCount64 mdbx_GetTickCount64;
MDBX_RegGetValueA mdbx_RegGetValueA;
MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* xMDBX_ALLOY */
#if __GNUC_PREREQ(8, 0)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-function-type"
#endif /* GCC/MINGW */
static void mdbx_winnt_import(void) {
#define GET_PROC_ADDR(dll, ENTRY) \
mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY)
const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");
if (hNtdll) {
if (GetProcAddress(hNtdll, "wine_get_version")) {
assert(mdbx_RunningUnderWine());
} else {
GET_PROC_ADDR(hNtdll, NtFsControlFile);
GET_PROC_ADDR(hNtdll, NtExtendSection);
assert(!mdbx_RunningUnderWine());
}
}
const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll");
if (hKernel32dll) {
GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx);
GET_PROC_ADDR(hKernel32dll, GetTickCount64);
if (!mdbx_GetTickCount64)
mdbx_GetTickCount64 = stub_GetTickCount64;
if (!mdbx_RunningUnderWine()) {
GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle);
GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange);
}
}
const osal_srwlock_t_function init =
(osal_srwlock_t_function)(hKernel32dll
? GetProcAddress(hKernel32dll,
"InitializeSRWLock")
: nullptr);
if (init != NULL) {
osal_srwlock_Init = init;
osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress(
hKernel32dll, "AcquireSRWLockShared");
osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress(
hKernel32dll, "ReleaseSRWLockShared");
osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress(
hKernel32dll, "AcquireSRWLockExclusive");
osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress(
hKernel32dll, "ReleaseSRWLockExclusive");
} else {
osal_srwlock_Init = stub_srwlock_Init;
osal_srwlock_AcquireShared = stub_srwlock_AcquireShared;
osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared;
osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive;
osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive;
}
const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");
if (hAdvapi32dll) {
GET_PROC_ADDR(hAdvapi32dll, RegGetValueA);
}
#undef GET_PROC_ADDR
}
#if __GNUC_PREREQ(8, 0)
#pragma GCC diagnostic pop
#endif /* GCC/MINGW */
#endif /* Windows LCK-implementation */
#endif /* Windows */

193
src/lck.c Normal file
View File

@ -0,0 +1,193 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Completes the LCK-file setup under the global rthc lock:
 *  - registers the env for reader-thread cleanup (rthc);
 *  - seizes the OS-level file lock (exclusive when possible);
 *  - maps the LCK-file, then initializes its header (exclusive case)
 *    or validates the magic/format signatures (cooperative case);
 *  - initializes the locking primitives via lck_init().
 * Returns MDBX_RESULT_TRUE when the exclusive lock was acquired (this
 * process is the first/only user), MDBX_RESULT_FALSE for cooperative
 * (shared) mode, otherwise an error code. */
__cold static int lck_setup_locked(MDBX_env *env) {
  int err = rthc_register(env);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  int lck_seize_rc = lck_seize(env);
  if (unlikely(MDBX_IS_ERROR(lck_seize_rc)))
    return lck_seize_rc;

  if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
    /* lck-less mode (no LCK-file, e.g. read-only filesystem or exclusive
     * open): use the in-process stub and don't limit the readers count. */
    env->lck = lckless_stub(env);
    env->max_readers = UINT_MAX;
    DEBUG("lck-setup:%s%s%s", " lck-less",
          (env->flags & MDBX_RDONLY) ? " readonly" : "",
          (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
    return lck_seize_rc;
  }

  DEBUG("lck-setup:%s%s%s", " with-lck",
        (env->flags & MDBX_RDONLY) ? " readonly" : "",
        (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");

  /* Detect another MDBX_env for the same DB inside this process. */
  MDBX_env *inprocess_neighbor = nullptr;
  err = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor);
  if (unlikely(MDBX_IS_ERROR(err)))
    return err;
  if (inprocess_neighbor) {
    /* Multi-open within one process is only allowed in the legacy mode,
     * and never together with MDBX_EXCLUSIVE. */
    if ((globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 ||
        (inprocess_neighbor->flags & MDBX_EXCLUSIVE) != 0)
      return MDBX_BUSY;
    if (lck_seize_rc == MDBX_RESULT_TRUE) {
      /* The neighbor is already using the DB, so the exclusive lock must
       * be downgraded to the shared/cooperative one. */
      err = lck_downgrade(env);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      lck_seize_rc = MDBX_RESULT_FALSE;
    }
  }

  uint64_t size = 0;
  err = osal_filesize(env->lck_mmap.fd, &size);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (lck_seize_rc == MDBX_RESULT_TRUE) {
    /* Exclusive: (re)create the LCK-file sized for max_readers slots,
     * rounded up to a whole number of system pages. */
    size =
        ceil_powerof2(env->max_readers * sizeof(reader_slot_t) + sizeof(lck_t),
                      globals.sys_pagesize);
    jitter4testing(false);
  } else {
    /* Cooperative: the file must already have a sane, page-aligned size. */
    if (env->flags & MDBX_EXCLUSIVE)
      return MDBX_BUSY;
    if (size > INT_MAX || (size & (globals.sys_pagesize - 1)) != 0 ||
        size < globals.sys_pagesize) {
      ERROR("lck-file has invalid size %" PRIu64 " bytes", size);
      return MDBX_PROBLEM;
    }
  }

  /* Derive the usable reader-slot count from the actual file size and
   * clamp it to the compile-time limit. */
  const size_t maxreaders =
      ((size_t)size - sizeof(lck_t)) / sizeof(reader_slot_t);
  if (maxreaders < 4) {
    ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders);
    return MDBX_PROBLEM;
  }
  env->max_readers = (maxreaders <= MDBX_READERS_LIMIT)
                         ? (unsigned)maxreaders
                         : (unsigned)MDBX_READERS_LIMIT;

  err = osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap,
                  (size_t)size, (size_t)size,
                  lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE
                               : MMAP_OPTION_SEMAPHORE);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

#if MDBX_ENABLE_MADVISE
  /* Hint the kernel: include the lck mapping in core dumps and
   * prefetch it, since the reader table is touched on every txn. */
#ifdef MADV_DODUMP
  err = madvise(env->lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno)
                                                      : MDBX_SUCCESS;
  if (unlikely(MDBX_IS_ERROR(err)))
    return err;
#endif /* MADV_DODUMP */
#ifdef MADV_WILLNEED
  err = madvise(env->lck_mmap.lck, size, MADV_WILLNEED) ? ignore_enosys(errno)
                                                        : MDBX_SUCCESS;
  if (unlikely(MDBX_IS_ERROR(err)))
    return err;
#elif defined(POSIX_MADV_WILLNEED)
  err = ignore_enosys(
      posix_madvise(env->lck_mmap.lck, size, POSIX_MADV_WILLNEED));
  if (unlikely(MDBX_IS_ERROR(err)))
    return err;
#endif /* MADV_WILLNEED */
#endif /* MDBX_ENABLE_MADVISE */

  lck_t *lck = env->lck_mmap.lck;
  if (lck_seize_rc == MDBX_RESULT_TRUE) {
    /* If we succeed got exclusive lock, then nobody is using the lock region
     * and we should initialize it. */
    memset(lck, 0, (size_t)size);
    jitter4testing(false);
    lck->magic_and_version = MDBX_LOCK_MAGIC;
    lck->os_and_format = MDBX_LOCK_FORMAT;
#if MDBX_ENABLE_PGOP_STAT
    lck->pgops.wops.weak = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    /* Flush the freshly initialized header before anybody else maps it. */
    err = osal_msync(&env->lck_mmap, 0, (size_t)size,
                     MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
    if (unlikely(err != MDBX_SUCCESS)) {
      ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err);
      eASSERT(env, MDBX_IS_ERROR(err));
      return err;
    }
  } else {
    /* Cooperative: validate the header written by another process. */
    if (lck->magic_and_version != MDBX_LOCK_MAGIC) {
      const bool invalid = (lck->magic_and_version >> 8) != MDBX_MAGIC;
      ERROR("lock region has %s",
            invalid
                ? "invalid magic"
                : "incompatible version (only applications with nearly or the "
                  "same versions of libmdbx can share the same database)");
      return invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH;
    }
    if (lck->os_and_format != MDBX_LOCK_FORMAT) {
      ERROR("lock region has os/format signature 0x%" PRIx32
            ", expected 0x%" PRIx32,
            lck->os_and_format, MDBX_LOCK_FORMAT);
      return MDBX_VERSION_MISMATCH;
    }
  }

  err = lck_init(env, inprocess_neighbor, lck_seize_rc);
  if (unlikely(err != MDBX_SUCCESS)) {
    eASSERT(env, MDBX_IS_ERROR(err));
    return err;
  }

  env->lck = lck;
  eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc));
  return lck_seize_rc;
}
/* Opens (or creates) the LCK-file for the environment and performs the
 * initial locking setup under the global rthc lock. When the file cannot
 * be opened, falls back to the without-lck mode if (and only if) the
 * environment flags permit it; otherwise the open error is returned. */
__cold int lck_setup(MDBX_env *env, mdbx_mode_t mode) {
  eASSERT(env, env->lazy_fd != INVALID_HANDLE_VALUE);
  eASSERT(env, env->lck_mmap.fd == INVALID_HANDLE_VALUE);

  int err = osal_openfile(MDBX_OPEN_LCK, env, env->pathname.lck,
                          &env->lck_mmap.fd, mode);
  if (err != MDBX_SUCCESS) {
    /* Decide whether running without a LCK-file is permitted for this
     * particular open-failure and the environment flags. */
    bool lckless_allowed;
    if (err == MDBX_ENOFILE || err == MDBX_EACCESS || err == MDBX_EPERM)
      /* only when the DB is opened both read-only AND exclusive */
      lckless_allowed = F_ISSET(env->flags, MDBX_RDONLY | MDBX_EXCLUSIVE);
    else if (err == MDBX_EROFS)
      /* read-only filesystem: acceptable for any read-only open */
      lckless_allowed = (env->flags & MDBX_RDONLY) != 0;
    else
      lckless_allowed = false;
    if (!lckless_allowed)
      return err;

    if (err != MDBX_ENOFILE) {
      /* ENSURE the file system is read-only */
      err = osal_check_fs_rdonly(env->lazy_fd, env->pathname.lck, err);
      if (err != MDBX_SUCCESS &&
          /* ignore ERROR_NOT_SUPPORTED for exclusive mode */
          !(err == MDBX_ENOSYS && (env->flags & MDBX_EXCLUSIVE)))
        return err;
    }

    /* without-lck mode (e.g. exclusive or on read-only filesystem) */
    env->lck_mmap.fd = INVALID_HANDLE_VALUE;
  }

  /* Finish the setup while holding the global rthc lock. */
  rthc_lock();
  err = lck_setup_locked(env);
  rthc_unlock();
  return err;
}
void mincore_clean_cache(const MDBX_env *const env) {
memset(env->lck->mincore_cache.begin, -1,
sizeof(env->lck->mincore_cache.begin));
}

112
src/lck.h Normal file
View File

@ -0,0 +1,112 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/// \brief Opens (or creates) the LCK-file and performs the initial setup of
///   the interprocess locking subsystem, falling back to the lck-less mode
///   when the LCK-file cannot be used (e.g. exclusive open, or a read-only
///   open on a read-only filesystem).
/// \return
///   MDBX_RESULT_TRUE (-1) - exclusive mode (first/only user of the DB),
///   MDBX_RESULT_FALSE (0) - cooperative/shared mode,
///   otherwise an error code.
MDBX_INTERNAL int lck_setup(MDBX_env *env, mdbx_mode_t mode);
#if MDBX_LOCKING > MDBX_LOCKING_SYSV
/// \brief Initializes the given osal_ipclock_t primitive.
MDBX_INTERNAL int lck_ipclock_stubinit(osal_ipclock_t *ipc);
/// \brief Destroys the given osal_ipclock_t primitive.
MDBX_INTERNAL int lck_ipclock_destroy(osal_ipclock_t *ipc);
#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */

/// \brief Initialization of synchronization primitives linked with MDBX_env
///   instance both in LCK-file and within the current process.
/// \param
///   global_uniqueness_flag = true - denotes that there are no other processes
///     working with DB and LCK-file. Thus the function MUST initialize
///     shared synchronization objects in memory-mapped LCK-file.
///   global_uniqueness_flag = false - denotes that at least one process is
///     already working with DB and LCK-file, including the case when DB
///     has already been opened in the current process. Thus the function
///     MUST NOT initialize shared synchronization objects in memory-mapped
///     LCK-file that are already in use.
/// \return Error code or zero on success.
MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor,
                           int global_uniqueness_flag);

/// \brief Disconnects from shared interprocess objects and destructs
///   synchronization objects linked with MDBX_env instance
///   within the current process.
/// \param
///   inprocess_neighbor = nullptr - if the current process does not have other
///     instances of MDBX_env linked with the DB being closed.
///     Thus the function MUST check for other processes working with DB or
///     LCK-file, and keep or destroy shared synchronization objects in
///     memory-mapped LCK-file depending on the result.
///   inprocess_neighbor = not-nullptr - pointer to another instance of MDBX_env
///     (anyone of there is several) working with DB or LCK-file within the
///     current process. Thus the function MUST NOT try to acquire exclusive
///     lock and/or try to destruct shared synchronization objects linked with
///     DB or LCK-file. Moreover, the implementation MUST ensure correct work
///     of other instances of MDBX_env within the current process, e.g.
///     restore POSIX-fcntl locks after the closing of file descriptors.
/// \return Error code (MDBX_PANIC) or zero on success.
MDBX_INTERNAL int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor,
                              const uint32_t current_pid);

/// \brief Connects to shared interprocess locking objects and tries to acquire
///   the maximum lock level (shared if exclusive is not available)
///   Depending on implementation or/and platform (Windows) this function may
///   acquire the non-OS super-level lock (e.g. for shared synchronization
///   objects initialization), which will be downgraded to OS-exclusive or
///   shared via explicit calling of lck_downgrade().
/// \return
///   MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
///     the current process is the first and only after the last use of DB.
///   MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
///     DB has already been opened and now is used by other processes.
///   Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL int lck_seize(MDBX_env *env);

/// \brief Downgrades the level of initially acquired lock to
///   operational level specified by argument. The reason for such downgrade:
///    - unblocking of other processes that are waiting for access, i.e.
///      if (env->flags & MDBX_EXCLUSIVE) != 0, then other processes
///      should be made aware that access is unavailable rather than
///      wait for it.
///    - freeing locks that interfere file operation (especially for Windows)
///   (env->flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
///   (env->flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
///   operational lock.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_downgrade(MDBX_env *env);

/// \brief Tries to upgrade the operational lock back to the exclusive one;
///   with dont_wait = true the function fails immediately instead of waiting.
/// \return Error code or zero on success
MDBX_MAYBE_UNUSED MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait);

/// \brief Locks LCK-file or/and table of readers for (de)registering.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env);

/// \brief Unlocks LCK-file or/and table of readers after (de)registering.
MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env);

/// \brief Acquires write-transaction lock.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_txn_lock(MDBX_env *env, bool dont_wait);

/// \brief Releases write-transaction lock.
MDBX_INTERNAL void lck_txn_unlock(MDBX_env *env);

/// \brief Sets alive-flag of reader presence (indicative lock) for PID of
///   the current process. The function does no more than needed for
///   the correct working of lck_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_rpid_set(MDBX_env *env);

/// \brief Resets alive-flag of reader presence (indicative lock)
///   for PID of the current process. The function does no more than needed
///   for the correct working of lck_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env);

/// \brief Checks for reading process status with the given pid with help of
///   alive-flag of presence (indicative lock) or using another way.
/// \return
///   MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
///     and working with DB (indicative lock is present).
///   MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
///     or not working with DB (indicative lock is not present).
///   Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid);

261
src/logging_and_debug.c Normal file
View File

@ -0,0 +1,261 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Dispatches a single log record while holding globals.debug_lock, so that
 * records from concurrent threads are not interleaved. The record goes:
 *  - to the user-installed logger callback: either pre-formatted into the
 *    user-supplied logger_buffer (nofmt flavour) or with the raw format
 *    string and va_list passed through (fmt flavour);
 *  - otherwise to the debugger output (Windows, only when a debugger is
 *    attached) or to stderr (elsewhere). */
__cold void debug_log_va(int level, const char *function, int line,
                         const char *fmt, va_list args) {
  ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0);
  if (globals.logger.ptr) {
    if (globals.logger_buffer == nullptr)
      globals.logger.fmt(level, function, line, fmt, args);
    else {
      /* pre-format into the user buffer for the nofmt-style callback */
      const int len = vsnprintf(globals.logger_buffer,
                                globals.logger_buffer_size, fmt, args);
      if (len > 0)
        globals.logger.nofmt(level, function, line, globals.logger_buffer, len);
    }
  } else {
#if defined(_WIN32) || defined(_WIN64)
    if (IsDebuggerPresent()) {
      /* emit the "function:line " prefix first, then the message body */
      int prefix_len = 0;
      char *prefix = nullptr;
      if (function && line > 0)
        prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line);
      else if (function)
        prefix_len = osal_asprintf(&prefix, "%s: ", function);
      else if (line > 0)
        prefix_len = osal_asprintf(&prefix, "%d: ", line);
      if (prefix_len > 0 && prefix) {
        OutputDebugStringA(prefix);
        osal_free(prefix);
      }
      char *msg = nullptr;
      int msg_len = osal_vasprintf(&msg, fmt, args);
      if (msg_len > 0 && msg) {
        OutputDebugStringA(msg);
        osal_free(msg);
      }
    }
#else
    if (function && line > 0)
      fprintf(stderr, "%s:%d ", function, line);
    else if (function)
      fprintf(stderr, "%s: ", function);
    else if (line > 0)
      fprintf(stderr, "%d: ", line);
    vfprintf(stderr, fmt, args);
    fflush(stderr);
#endif
  }
  ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0);
}
/* Variadic wrapper: packs the arguments into a va_list and forwards the
 * record to debug_log_va(). */
__cold void debug_log(int level, const char *function, int line,
                      const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  debug_log_va(level, function, line, fmt, ap);
  va_end(ap);
}
/* Dump a val in ascii or hexadecimal.
 *
 * Returns a printable representation of `val` for diagnostic messages:
 *  - "<null>" / "<empty>" / "<nullptr.N>" for degenerate inputs;
 *  - the raw bytes when they are all printable ASCII;
 *  - otherwise a "<hex...>" dump truncated to fit `bufsize`.
 * Returns nullptr when the caller-provided buffer is unusable
 * (absent or smaller than 4 bytes). */
__cold const char *mdbx_dump_val(const MDBX_val *val, char *const buf,
                                 const size_t bufsize) {
  if (!val)
    return "<null>";
  if (!val->iov_len)
    return "<empty>";
  if (!buf || bufsize < 4)
    return nullptr;
  if (!val->iov_base) {
    int len = snprintf(buf, bufsize, "<nullptr.%zu>", val->iov_len);
    assert(len > 0 && (size_t)len < bufsize);
    (void)len;
    return buf;
  }

  /* Scan once to decide between verbatim-ascii and hex representation. */
  bool is_ascii = true;
  const uint8_t *const data = val->iov_base;
  for (size_t i = 0; i < val->iov_len; i++)
    if (data[i] < ' ' || data[i] > '~') {
      is_ascii = false;
      break;
    }

  if (is_ascii) {
    int len =
        snprintf(buf, bufsize, "%.*s",
                 (val->iov_len > INT_MAX) ? INT_MAX : (int)val->iov_len, data);
    assert(len > 0 && (size_t)len < bufsize);
    (void)len;
  } else {
    /* Hoisted out of the loop (was re-declared on every iteration)
     * and made static const: one read-only table instead of a fresh
     * stack copy per byte. */
    static const char hex[16] = {'0', '1', '2', '3', '4', '5', '6', '7',
                                 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
    /* Reserve room for the closing '>' and the terminating NUL. */
    char *const detent = buf + bufsize - 2;
    char *ptr = buf;
    *ptr++ = '<';
    for (size_t i = 0; i < val->iov_len && ptr < detent; i++) {
      *ptr++ = hex[data[i] >> 4];
      *ptr++ = hex[data[i] & 15];
    }
    if (ptr < detent)
      *ptr++ = '>';
    *ptr = '\0';
  }
  return buf;
}
/*------------------------------------------------------------------------------
LY: debug stuff */
/* Maps a page-type bit combination to a human-readable caption; unknown
 * combinations are formatted into the caller-provided 16-byte buffer. */
__cold const char *pagetype_caption(const uint8_t type, char buf4unknown[16]) {
  if (type == P_BRANCH)
    return "branch";
  if (type == P_LEAF)
    return "leaf";
  if (type == (P_LEAF | P_SUBP))
    return "subleaf";
  if (type == (P_LEAF | P_DUPFIX))
    return "dupfix-leaf";
  if (type == (P_LEAF | P_DUPFIX | P_SUBP))
    return "dupfix-subleaf";
  if (type == (P_LEAF | P_DUPFIX | P_SUBP | P_LEGACY_DIRTY))
    return "dupfix-subleaf.legacy-dirty";
  if (type == P_LARGE)
    return "large";
  snprintf(buf4unknown, 16, "unknown_0x%x", type);
  return buf4unknown;
}
/* Returns a caption suffix describing the payload kind of a leaf node. */
__cold static const char *leafnode_type(node_t *n) {
  if (node_flags(n) & N_BIGDATA)
    return ": large page";
  static const char *const captions[2][2] = {{"", ": DB"},
                                             {": sub-page", ": sub-DB"}};
  return captions[(node_flags(n) & N_DUPDATA) != 0]
                 [(node_flags(n) & N_SUBDATA) != 0];
}
/* Display all the keys in the page.
 *
 * Diagnostic dump via the VERBOSE log level: prints the page caption,
 * then every key (with per-entry size accounting) and finally the
 * header/contents/unused space totals. Large, meta and unrecognized
 * pages carry no key entries and are reported immediately. */
__cold void page_list(page_t *mp) {
  pgno_t pgno = mp->pgno;
  const char *type;
  node_t *node;
  size_t i, nkeys, nsize, total = 0;
  MDBX_val key;
  DKBUF;

  switch (page_type(mp)) {
  case P_BRANCH:
    type = "Branch page";
    break;
  case P_LEAF:
    type = "Leaf page";
    break;
  case P_LEAF | P_SUBP:
    type = "Leaf sub-page";
    break;
  case P_LEAF | P_DUPFIX:
    type = "Leaf2 page";
    break;
  case P_LEAF | P_DUPFIX | P_SUBP:
    type = "Leaf2 sub-page";
    break;
  case P_LARGE:
    VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->pages);
    return;
  case P_META:
    VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno,
            unaligned_peek_u64(4, page_meta(mp)->txnid_a));
    return;
  default:
    VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->flags);
    return;
  }

  nkeys = page_numkeys(mp);
  VERBOSE("%s %" PRIaPGNO " numkeys %zu\n", type, pgno, nkeys);

  for (i = 0; i < nkeys; i++) {
    if (is_dupfix_leaf(
            mp)) { /* DUPFIX pages have no entries[] or node headers */
      key = page_dupfix_key(mp, i, nsize = mp->dupfix_ksize);
      total += nsize;
      VERBOSE("key %zu: nsize %zu, %s\n", i, nsize, DKEY(&key));
      continue;
    }
    node = page_node(mp, i);
    key.iov_len = node_ks(node);
    key.iov_base = node->payload;
    /* account the node header together with the key bytes */
    nsize = NODESIZE + key.iov_len;
    if (is_branch(mp)) {
      VERBOSE("key %zu: page %" PRIaPGNO ", %s\n", i, node_pgno(node),
              DKEY(&key));
      total += nsize;
    } else {
      /* leaf entry: big data is stored out-of-page as a pgno reference */
      if (node_flags(node) & N_BIGDATA)
        nsize += sizeof(pgno_t);
      else
        nsize += node_ds(node);
      total += nsize;
      nsize += sizeof(indx_t);
      VERBOSE("key %zu: nsize %zu, %s%s\n", i, nsize, DKEY(&key),
              leafnode_type(node));
    }
    /* entries are stored with even alignment */
    total = EVEN_CEIL(total);
  }
  VERBOSE("Total: header %u + contents %zu + unused %zu\n",
          is_dupfix_leaf(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->lower, total,
          page_room(mp));
}
/* Atomically (under globals.debug_lock) applies a new logging level, debug
 * flags and logger callback, and returns the previous state packed as
 * (runtime_flags | loglevel << 16). The MDBX_LOG_DONTCHANGE,
 * MDBX_DBG_DONTCHANGE and MDBX_LOGGER_DONTCHANGE sentinels keep the
 * corresponding setting untouched. */
__cold static int setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags,
                              union logger_union logger, char *buffer,
                              size_t buffer_size) {
  ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0);
  /* pack the previous state for the return value */
  const int rc = globals.runtime_flags | (globals.loglevel << 16);

  if (level != MDBX_LOG_DONTCHANGE)
    globals.loglevel = (uint8_t)level;

  if (flags != MDBX_DBG_DONTCHANGE) {
    /* mask out flags that are unavailable in this build configuration
     * (assert/audit/jitter exist only in MDBX_DEBUG builds) */
    flags &=
#if MDBX_DEBUG
        MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER |
#endif
        MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP |
        MDBX_DBG_DONT_UPGRADE;
    globals.runtime_flags = (uint8_t)flags;
  }

  /* the DONTCHANGE sentinel is the all-ones pointer for every union member */
  assert(MDBX_LOGGER_DONTCHANGE == ((MDBX_debug_func *)(intptr_t)-1));
  if (logger.ptr != (void *)((intptr_t)-1)) {
    globals.logger.ptr = logger.ptr;
    globals.logger_buffer = buffer;
    globals.logger_buffer_size = buffer_size;
  }

  ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0);
  return rc;
}
/* Public API: installs a pre-formatted ("nofmt") logger callback together
 * with its output buffer. A callback without a usable buffer is treated as
 * "keep the current logger". Returns the previous packed debug state. */
__cold int mdbx_setup_debug_nofmt(MDBX_log_level_t level,
                                  MDBX_debug_flags_t flags,
                                  MDBX_debug_func_nofmt *logger, char *buffer,
                                  size_t buffer_size) {
  union logger_union thunk;
  if (logger && buffer && buffer_size)
    thunk.nofmt = logger;
  else
    thunk.nofmt = MDBX_LOGGER_NOFMT_DONTCHANGE;
  return setup_debug(level, flags, thunk, buffer, buffer_size);
}
/* Public API: installs a format-string ("fmt") logger callback without an
 * intermediate buffer. Returns the previous packed debug state. */
__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags,
                            MDBX_debug_func *logger) {
  union logger_union thunk = {.fmt = logger};
  return setup_debug(level, flags, thunk, nullptr, 0);
}

160
src/logging_and_debug.h Normal file
View File

@ -0,0 +1,160 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
#ifndef __Wpedantic_format_voidptr
/* Passes a pointer through unchanged; used to silence -Wpedantic/-Wformat
 * complaints when printing arbitrary pointers via the %p conversion. */
MDBX_MAYBE_UNUSED static inline const void *
__Wpedantic_format_voidptr(const void *ptr) {
  return ptr;
}
#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
#endif /* __Wpedantic_format_voidptr */

/* Core logging entry points (implemented in logging_and_debug.c). */
MDBX_INTERNAL void MDBX_PRINTF_ARGS(4, 5)
    debug_log(int level, const char *function, int line, const char *fmt, ...)
        MDBX_PRINTF_ARGS(4, 5);
MDBX_INTERNAL void debug_log_va(int level, const char *function, int line,
                                const char *fmt, va_list args);

/* Gates for logging and auditing: in release (non-MDBX_DEBUG) builds the
 * EXTRA/TRACE/DEBUG levels are compiled out and auditing is disabled. */
#if MDBX_DEBUG
#define LOG_ENABLED(LVL) unlikely(LVL <= globals.loglevel)
#define AUDIT_ENABLED()                                                        \
  unlikely((globals.runtime_flags & (unsigned)MDBX_DBG_AUDIT))
#else /* MDBX_DEBUG */
#define LOG_ENABLED(LVL) (LVL < MDBX_LOG_VERBOSE && LVL <= globals.loglevel)
#define AUDIT_ENABLED() (0)
#endif /* LOG_ENABLED() & AUDIT_ENABLED() */

#if MDBX_FORCE_ASSERTIONS
#define ASSERT_ENABLED() (1)
#elif MDBX_DEBUG
#define ASSERT_ENABLED()                                                       \
  likely((globals.runtime_flags & (unsigned)MDBX_DBG_ASSERT))
#else
#define ASSERT_ENABLED() (0)
#endif /* ASSERT_ENABLED() */

/* Per-level logging macros: each checks LOG_ENABLED() first, so the format
 * arguments are not evaluated when the level is filtered out. */
#define DEBUG_EXTRA(fmt, ...)                                                  \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_EXTRA))                                           \
      debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__);         \
  } while (0)

#define DEBUG_EXTRA_PRINT(fmt, ...)                                            \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_EXTRA))                                           \
      debug_log(MDBX_LOG_EXTRA, nullptr, 0, fmt, __VA_ARGS__);                 \
  } while (0)

#define TRACE(fmt, ...)                                                        \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_TRACE))                                           \
      debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__);    \
  } while (0)

#define DEBUG(fmt, ...)                                                        \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_DEBUG))                                           \
      debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__);    \
  } while (0)

#define VERBOSE(fmt, ...)                                                      \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_VERBOSE))                                         \
      debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__);  \
  } while (0)

#define NOTICE(fmt, ...)                                                       \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_NOTICE))                                          \
      debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__);   \
  } while (0)

#define WARNING(fmt, ...)                                                      \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_WARN))                                            \
      debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__);     \
  } while (0)

#undef ERROR /* wingdi.h exposes an ERROR macro in the public namespace;     \
                drop it so the name can be reused for the logging macro. */
#define ERROR(fmt, ...)                                                        \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_ERROR))                                           \
      debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__);    \
  } while (0)

/* FATAL records are logged unconditionally. */
#define FATAL(fmt, ...)                                                        \
  debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);

#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
                                      unsigned line);
#define ASSERT_FAIL(env, msg, func, line)                                      \
  do {                                                                         \
    (void)(env);                                                               \
    assert_fail(msg, func, line);                                              \
  } while (0)
#endif /* MDBX_DEBUG */

/* ENSURE() fires in all builds; eASSERT() only when assertions are enabled. */
#define ENSURE_MSG(env, expr, msg)                                             \
  do {                                                                         \
    if (unlikely(!(expr)))                                                     \
      ASSERT_FAIL(env, msg, __func__, __LINE__);                               \
  } while (0)

#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)

/* assert(3) variant in environment context */
#define eASSERT(env, expr)                                                     \
  do {                                                                         \
    if (ASSERT_ENABLED())                                                      \
      ENSURE(env, expr);                                                       \
  } while (0)

/* assert(3) variant in cursor context */
#define cASSERT(mc, expr) eASSERT((mc)->txn->env, expr)

/* assert(3) variant in transaction context */
#define tASSERT(txn, expr) eASSERT((txn)->env, expr)

#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */
#undef assert
#define assert(expr) eASSERT(nullptr, expr)
#endif

/* Injects a small random delay when the MDBX_DBG_JITTER flag is set
 * (debug builds only), to shake out timing-dependent bugs in testing. */
MDBX_MAYBE_UNUSED static inline void jitter4testing(bool tiny) {
#if MDBX_DEBUG
  if (globals.runtime_flags & (unsigned)MDBX_DBG_JITTER)
    osal_jitter(tiny);
#else
  (void)tiny;
#endif
}

MDBX_MAYBE_UNUSED MDBX_INTERNAL void page_list(page_t *mp);
MDBX_INTERNAL const char *pagetype_caption(const uint8_t type,
                                           char buf4unknown[16]);

/* Key size which fits in a DKBUF (debug key buffer). */
#define DKBUF_MAX 127
#define DKBUF char dbg_kbuf[DKBUF_MAX * 4 + 2]
#define DKEY(x) mdbx_dump_val(x, dbg_kbuf, DKBUF_MAX * 2 + 1)
#define DVAL(x)                                                                \
  mdbx_dump_val(x, dbg_kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1)

#if MDBX_DEBUG
#define DKBUF_DEBUG DKBUF
#define DKEY_DEBUG(x) DKEY(x)
#define DVAL_DEBUG(x) DVAL(x)
#else
#define DKBUF_DEBUG ((void)(0))
#define DKEY_DEBUG(x) ("-")
#define DVAL_DEBUG(x) ("-")
#endif

View File

@ -1,18 +1,14 @@
//
// Copyright (c) 2020-2024, Leonid Yuriev <leo@yuriev.ru>.
// SPDX-License-Identifier: Apache-2.0
//
// Non-inline part of the libmdbx C++ API
//
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2020-2024
///
/// \brief Non-inline part of the libmdbx C++ API
///
#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif /* _CRT_SECURE_NO_WARNINGS */
#include "essentials.h"
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#if !defined(MDBX_BUILD_CXX) || MDBX_BUILD_CXX != 1
#error "Build is misconfigured! Expecting MDBX_BUILD_CXX=1 for C++ API."
#endif /* MDBX_BUILD_CXX*/
/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */
#if defined(_MSC_VER) && defined(__SANITIZE_ADDRESS__) && \
@ -22,8 +18,6 @@
#include "../mdbx.h++"
#include "internals.h"
#include <array>
#include <atomic>
#include <cctype> // for isxdigit(), etc
@ -402,6 +396,7 @@ __cold void error::throw_exception() const {
CASE_EXCEPTION(incompatible_operation, MDBX_INCOMPATIBLE);
CASE_EXCEPTION(internal_page_full, MDBX_PAGE_FULL);
CASE_EXCEPTION(internal_problem, MDBX_PROBLEM);
CASE_EXCEPTION(key_exists, MDBX_KEYEXIST);
CASE_EXCEPTION(key_mismatch, MDBX_EKEYMISMATCH);
CASE_EXCEPTION(max_maps_reached, MDBX_DBS_FULL);
CASE_EXCEPTION(max_readers_reached, MDBX_READERS_FULL);
@ -1227,7 +1222,7 @@ env::operate_parameters::make_flags(bool accede, bool use_subdirectory) const {
if (options.nested_write_transactions)
flags &= ~MDBX_WRITEMAP;
if (reclaiming.coalesce)
flags |= MDBX_env_flags_t(MDBX_DEPRECATED_COALESCE);
flags |= MDBX_COALESCE;
if (reclaiming.lifo)
flags |= MDBX_LIFORECLAIM;
switch (durability) {
@ -1272,7 +1267,7 @@ env::durability env::operate_parameters::durability_from_flags(
env::reclaiming_options::reclaiming_options(MDBX_env_flags_t flags) noexcept
: lifo((flags & MDBX_LIFORECLAIM) ? true : false),
coalesce((flags & MDBX_DEPRECATED_COALESCE) ? true : false) {}
coalesce((flags & MDBX_COALESCE) ? true : false) {}
env::operate_options::operate_options(MDBX_env_flags_t flags) noexcept
: no_sticky_threads(((flags & (MDBX_NOSTICKYTHREADS | MDBX_EXCLUSIVE)) ==
@ -1742,21 +1737,20 @@ __cold ::std::ostream &operator<<(::std::ostream &out,
const char *suffix;
} static const scales[] = {
#if MDBX_WORDBITS > 32
{env_managed::geometry::EiB, "EiB"},
{env_managed::geometry::EB, "EB"},
{env_managed::geometry::PiB, "PiB"},
{env_managed::geometry::PB, "PB"},
{env_managed::geometry::TiB, "TiB"},
{env_managed::geometry::TB, "TB"},
{env_managed::geometry::EiB, "EiB"},
{env_managed::geometry::EB, "EB"},
{env_managed::geometry::PiB, "PiB"},
{env_managed::geometry::PB, "PB"},
{env_managed::geometry::TiB, "TiB"},
{env_managed::geometry::TB, "TB"},
#endif
{env_managed::geometry::GiB, "GiB"},
{env_managed::geometry::GB, "GB"},
{env_managed::geometry::MiB, "MiB"},
{env_managed::geometry::MB, "MB"},
{env_managed::geometry::KiB, "KiB"},
{env_managed::geometry::kB, "kB"},
{1, " bytes"}
};
{env_managed::geometry::GiB, "GiB"},
{env_managed::geometry::GB, "GB"},
{env_managed::geometry::MiB, "MiB"},
{env_managed::geometry::MB, "MB"},
{env_managed::geometry::KiB, "KiB"},
{env_managed::geometry::kB, "kB"},
{1, " bytes"}};
for (const auto i : scales)
if (bytes % i.one == 0)

746
src/meta.c Normal file
View File

@ -0,0 +1,746 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Consistent snapshot of one meta-page header: its transaction id and
 * steadiness flag. Both fields are zeroed when a torn (concurrent) read is
 * detected, so a zero txnid also means "retry". */
typedef struct meta_snap {
  uint64_t txnid;     /* transaction id, 0 if the read was torn */
  size_t is_steady;   /* non-zero iff the meta carries a steady signature */
} meta_snap_t;
/* Loads a 64-bit txnid that a concurrent writer may be updating.
 * On amd64/e2k (where MDBX_UNALIGNED_OK >= 8 and UBSan is off) a single
 * atomic 64-bit load suffices; otherwise the value is assembled from two
 * 32-bit atomic loads, reading the low half first — the endian-dependent
 * index selects which array element holds which half. */
static inline txnid_t fetch_txnid(const volatile mdbx_atomic_uint32_t *ptr) {
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) &&      \
    MDBX_UNALIGNED_OK >= 8
  return atomic_load64((const volatile mdbx_atomic_uint64_t *)ptr,
                       mo_AcquireRelease);
#else
  const uint32_t l = atomic_load32(
      &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
  const uint32_t h = atomic_load32(
      &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
  return (uint64_t)h << 32 | l;
#endif
}
/* Takes a torn-read-safe snapshot of a meta-page: reads txnid_a, then the
 * steadiness flag, then re-reads via txnid_b. If the two txnid copies
 * disagree, a writer was mid-update, so both fields are zeroed to signal an
 * unusable snapshot. The jitter4testing() calls widen the race window in
 * debug builds to make the retry path testable. */
static inline meta_snap_t meta_snap(const volatile meta_t *meta) {
  txnid_t txnid = fetch_txnid(meta->txnid_a);
  jitter4testing(true);
  size_t is_steady = meta_is_steady(meta) && txnid >= MIN_TXNID;
  jitter4testing(true);
  if (unlikely(txnid != fetch_txnid(meta->txnid_b)))
    txnid = is_steady = 0;
  meta_snap_t r = {txnid, is_steady};
  return r;
}
txnid_t meta_txnid(const volatile meta_t *meta) {
return meta_snap(meta).txnid;
}
/* Builds a meta_ptr_t descriptor (pointer + txnid + steadiness) for the
 * n-th meta-page of the environment. */
meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) {
  eASSERT(env, n < NUM_METAS);
  meta_ptr_t result;
  result.ptr_v = METAPAGE(env, n);
  const meta_snap_t snap = meta_snap(result.ptr_v);
  result.txnid = snap.txnid;
  result.is_steady = snap.is_steady;
  return result;
}
/* Packs pairwise comparison results of the three meta-pages into the troika
 * FSM byte returned to meta_troika_unpack():
 *   bits 0-1: "tail" (least preferable) meta index,
 *   bits 2-3: most recent meta index,
 *   bits 4-5: preferred-steady meta index,
 *   bit 6:    "strict" flag, bit 7: "valid" flag.
 * cXY is the 3-way compare (0/1/2) of meta X vs meta Y; sX is X's steadiness.
 * NOTE(review): the valid/strict expressions treat cXY == 1 with equal
 * steadiness as a duplicate pair — presumably 1 encodes "equal"; confirm
 * against meta_cmp2int(). Correctness of the packing is cross-checked
 * exhaustively by troika_verify_fsm(). */
static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, uint8_t c12, bool s0,
                             bool s1, bool s2) {
  assert(c01 < 3 && c02 < 3 && c12 < 3);
  /* assert(s0 < 2 && s1 < 2 && s2 < 2); */

  /* most recent: tournament of pairwise "recent" preferences */
  const uint8_t recent = meta_cmp2recent(c01, s0, s1)
                             ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2)
                             : (meta_cmp2recent(c12, s1, s2) ? 1 : 2);
  /* preferred steady: tournament of pairwise "steady" preferences */
  const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1)
                                    ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2)
                                    : (meta_cmp2steady(c12, s1, s2) ? 1 : 2);

  /* the tail is the less preferable of the two non-recent metas */
  uint8_t tail;
  if (recent == 0)
    tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
  else if (recent == 1)
    tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
  else
    tail = meta_cmp2steady(c01, s0, s1) ? 1 : 0;

  /* valid: not all three pairs are duplicates; strict: no pair duplicates */
  const bool valid =
      c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2;
  const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) &&
                      (c12 != 1 || s1 != s2);
  return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7;
}
/* Unpacks a byte produced by meta_cmp2pack() into the troika fields:
 * recent index (bits 2-3), preferred-steady index (bits 4-5), and the
 * combined tail index + valid/strict flags (bits 0-1 and 6-7). */
static inline void meta_troika_unpack(troika_t *troika, const uint8_t packed) {
  troika->recent = 3 & (packed >> 2);
  troika->prefer_steady = 3 & (packed >> 4);
  troika->tail_and_flags = 0xC3 & packed;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
  troika->unused_pad = 0;
#endif
}
/* Precomputed lookup table mapping the raw troika FSM state (3 steadiness
 * bits + three base-3 pairwise comparison digits = 2*2*2*3*3*3 = 216 states)
 * to the packed byte produced by meta_cmp2pack(). The table contents are
 * verified exhaustively at runtime by troika_verify_fsm(). */
static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = {
    232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232,
    168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169,
    232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233,
    169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152,
    168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212,
    214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137,
    216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40,
    129, 148, 150, 168, 169, 168, 40,  169, 129, 152, 194, 233, 169, 168, 169,
    168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168,
    168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228,
    212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233,
    233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150,
    164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214,
    214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194,
    210, 194, 225, 193, 210, 194};
/* Self-test: exhaustively re-derives every entry of troika_fsm_map[] from
 * first principles (meta_cmp2pack over all 216 input combinations) and
 * checks that unpacking reproduces the independently recomputed recent /
 * prefer_steady / tail indexes and valid / strict flags. Returns true iff
 * the table and the pack/unpack round-trip are consistent. */
__cold bool troika_verify_fsm(void) {
  bool ok = true;
  for (size_t i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) {
    /* decode state index: 3 steadiness bits, then three base-3 digits */
    const bool s0 = (i >> 0) & 1;
    const bool s1 = (i >> 1) & 1;
    const bool s2 = (i >> 2) & 1;
    const uint8_t c01 = (i / (8 * 1)) % 3;
    const uint8_t c02 = (i / (8 * 3)) % 3;
    const uint8_t c12 = (i / (8 * 9)) % 3;

    const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2);
    troika_t troika;
    troika.fsm = (uint8_t)i;
    meta_troika_unpack(&troika, packed);

    const uint8_t tail = TROIKA_TAIL(&troika);
    const bool strict = TROIKA_STRICT_VALID(&troika);
    const bool valid = TROIKA_VALID(&troika);

    /* independent re-derivation of the expected values */
    const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1)
                                   ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2)
                                   : (meta_cmp2recent(c12, s1, s2) ? 1 : 2);
    const uint8_t prefer_steady_chk =
        meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2)
                                     : (meta_cmp2steady(c12, s1, s2) ? 1 : 2);

    uint8_t tail_chk;
    if (recent_chk == 0)
      tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
    else if (recent_chk == 1)
      tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
    else
      tail_chk = meta_cmp2steady(c01, s0, s1) ? 1 : 0;

    const bool valid_chk =
        c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2;
    const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) &&
                            (c12 != 1 || s1 != s2);

    /* assert() catches mismatches in debug builds; the flag covers release */
    assert(troika.recent == recent_chk);
    assert(troika.prefer_steady == prefer_steady_chk);
    assert(tail == tail_chk);
    assert(valid == valid_chk);
    assert(strict == strict_chk);
    assert(troika_fsm_map[troika.fsm] == packed);
    if (troika.recent != recent_chk ||
        troika.prefer_steady != prefer_steady_chk || tail != tail_chk ||
        valid != valid_chk || strict != strict_chk ||
        troika_fsm_map[troika.fsm] != packed) {
      ok = false;
    }
  }
  return ok;
}
/* Snapshots all three meta-pages and classifies them: encodes the three
 * steadiness bits plus the three pairwise txnid comparisons into the raw
 * fsm index, then resolves recent / prefer_steady / tail via the
 * precomputed troika_fsm_map table. */
__hot troika_t meta_tap(const MDBX_env *env) {
  meta_snap_t snap;
  troika_t troika;
  snap = meta_snap(METAPAGE(env, 0));
  troika.txnid[0] = snap.txnid;
  troika.fsm = (uint8_t)snap.is_steady << 0;
  snap = meta_snap(METAPAGE(env, 1));
  troika.txnid[1] = snap.txnid;
  troika.fsm += (uint8_t)snap.is_steady << 1;
  /* pairwise comparison digits are base-3, scaled by 8 (= steadiness bits) */
  troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8);
  snap = meta_snap(METAPAGE(env, 2));
  troika.txnid[2] = snap.txnid;
  troika.fsm += (uint8_t)snap.is_steady << 2;
  troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3);
  troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3);

  meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]);
  return troika;
}
/* Returns the largest txnid among the three meta-pages, i.e. the id of the
 * most recently committed transaction. */
txnid_t recent_committed_txnid(const MDBX_env *env) {
  txnid_t maximal = meta_txnid(METAPAGE(env, 0));
  const txnid_t second = meta_txnid(METAPAGE(env, 1));
  if (second > maximal)
    maximal = second;
  const txnid_t third = meta_txnid(METAPAGE(env, 2));
  if (third > maximal)
    maximal = third;
  return maximal;
}
/* True iff metas a and b carry the same non-zero txnid and the same
 * steadiness bit within the given troika snapshot. */
static inline bool meta_eq(const troika_t *troika, size_t a, size_t b) {
  assert(a < NUM_METAS && b < NUM_METAS);
  if (troika->txnid[a] != troika->txnid[b] || !troika->txnid[a])
    return false;
  const unsigned steadiness_diff = (troika->fsm >> a) ^ (troika->fsm >> b);
  return (steadiness_diff & 1) == 0;
}
/* Bitmask of duplicate meta pairs: bit0 = (0,1), bit1 = (1,2), bit2 = (2,0). */
unsigned meta_eq_mask(const troika_t *troika) {
  unsigned mask = meta_eq(troika, 0, 1) ? 1 : 0;
  if (meta_eq(troika, 1, 2))
    mask |= 2;
  if (meta_eq(troika, 2, 0))
    mask |= 4;
  return mask;
}
/* Re-taps the meta triplet and reports whether anything changed since the
 * previous snapshot; *troika is refreshed in place. */
__hot bool meta_should_retry(const MDBX_env *env, troika_t *troika) {
  const troika_t before = *troika;
  *troika = meta_tap(env);
  if (before.fsm != troika->fsm)
    return true;
  for (size_t i = 0; i < NUM_METAS; ++i)
    if (before.txnid[i] != troika->txnid[i])
      return true;
  return false;
}
const char *durable_caption(const meta_t *const meta) {
if (meta_is_steady(meta))
return (meta_sign_get(meta) == meta_sign_calculate(meta)) ? "Steady"
: "Tainted";
return "Weak";
}
/* Logs a one-line diagnostic dump of a troika snapshot: the three txnids
 * with their steadiness ('s'/'w'), the raw fsm byte, and the resolved
 * head/base/tail selections plus the valid/strict flags. */
__cold void meta_troika_dump(const MDBX_env *env, const troika_t *troika) {
  const meta_ptr_t recent = meta_recent(env, troika);
  const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika);
  const meta_ptr_t tail = meta_tail(env, troika);
  NOTICE("troika: %" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, "
         "head=%d-%" PRIaTXN ".%c, "
         "base=%d-%" PRIaTXN ".%c, "
         "tail=%d-%" PRIaTXN ".%c, "
         "valid %c, strict %c",
         troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1],
         (troika->fsm & 2) ? 's' : 'w', troika->txnid[2],
         (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent,
         recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady,
         prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w',
         troika->tail_and_flags % NUM_METAS, tail.txnid,
         tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N',
         TROIKA_STRICT_VALID(troika) ? 'Y' : 'N');
}
/*----------------------------------------------------------------------------*/
/* Wipes the steady signature of one meta-page if its txnid is within the
 * given inclusive bound, demoting it to "weak". Returns MDBX_RESULT_FALSE
 * when the meta was left untouched, MDBX_RESULT_TRUE when wiped (write
 * issued or deferred to a later msync in WRITEMAP+AVOID_MSYNC mode), or an
 * error code from the pwrite. */
static int meta_unsteady(MDBX_env *env, const txnid_t inclusive_upto,
                         const pgno_t pgno) {
  meta_t *const meta = METAPAGE(env, pgno);
  const txnid_t txnid = constmeta_txnid(meta);
  if (!meta_is_steady(meta) || txnid > inclusive_upto)
    return MDBX_RESULT_FALSE;

  WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno);
  const uint64_t wipe = DATASIGN_NONE;
  const void *ptr = &wipe;
  size_t bytes = sizeof(meta->sign),
         offset = ptr_dist(&meta->sign, env->dxb_mmap.base);
  if (env->flags & MDBX_WRITEMAP) {
    /* mutate the mapped page directly; with msync allowed the caller's
     * later msync makes it durable, otherwise fall through and pwrite the
     * whole page */
    unaligned_poke_u64(4, meta->sign, wipe);
    osal_flush_incoherent_cpu_writeback();
    if (!MDBX_AVOID_MSYNC)
      return MDBX_RESULT_TRUE;
    ptr = data_page(meta);
    offset = ptr_dist(ptr, env->dxb_mmap.base);
    bytes = env->ps;
  }

#if MDBX_ENABLE_PGOP_STAT
  env->lck->pgops.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  int err = osal_pwrite(env->fd4meta, ptr, bytes, offset);
  return likely(err == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : err;
}
/* Demotes every steady meta-page with txnid <= inclusive_upto to weak, then
 * makes the demotion durable (msync or fsync as appropriate), forces
 * readers to refresh their oldest-snapshot view, and refreshes the troika
 * cached in the basal transaction (and any nested ones). */
__cold int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto) {
  int err = meta_unsteady(env, inclusive_upto, 0);
  if (likely(!MDBX_IS_ERROR(err)))
    err = meta_unsteady(env, inclusive_upto, 1);
  if (likely(!MDBX_IS_ERROR(err)))
    err = meta_unsteady(env, inclusive_upto, 2);

  if (err == MDBX_RESULT_TRUE) {
    /* at least one meta was wiped — flush it to durable storage */
    err = MDBX_SUCCESS;
    if (!MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) {
      err = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                       MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    } else if (env->fd4meta == env->lazy_fd) {
      err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    }
  }

  osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS),
                             globals.sys_pagesize);

  /* force oldest refresh */
  atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
  env->basal_txn->tw.troika = meta_tap(env);
  for (MDBX_txn *scan = env->basal_txn->nested; scan; scan = scan->nested)
    scan->tw.troika = env->basal_txn->tw.troika;
  return err;
}
/* Makes the given head meta-page durable: msync of the meta area in
 * WRITEMAP mode, or an explicit pwrite + fsync otherwise. On success the
 * lck-side meta_sync_txnid marker is advanced to head.txnid so repeated
 * syncs of the same head can be skipped by callers.
 *
 * Returns MDBX_SUCCESS (or MDBX_RESULT_TRUE if nothing had to be written),
 * otherwise an error from msync/pwrite/fsync. */
int meta_sync(const MDBX_env *env, const meta_ptr_t head) {
  eASSERT(env, atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) !=
                   (uint32_t)head.txnid);
  /* Note: this function may be called even when (env->flags &
   * MDBX_NOMETASYNC) == 0 and env->fd4meta == env->dsync_fd, e.g. when the
   * previous transaction was committed with the MDBX_NOMETASYNC flag. */
  int rc = MDBX_RESULT_TRUE;
  if (env->flags & MDBX_WRITEMAP) {
    if (!MDBX_AVOID_MSYNC) {
      rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                      MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    } else {
      /* FIXED: was `#if MDBX_ENABLE_PGOP_ST` — a truncated macro name that
       * always evaluated false and silently skipped this wops counter. */
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      const page_t *page = data_page(head.ptr_c);
      rc = osal_pwrite(env->fd4meta, page, env->ps,
                       ptr_dist(page, env->dxb_mmap.base));
      if (likely(rc == MDBX_SUCCESS) && env->fd4meta == env->lazy_fd) {
        rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
        env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      }
    }
  } else {
    rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
    env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  }

  if (likely(rc == MDBX_SUCCESS))
    env->lck->meta_sync_txnid.weak = (uint32_t)head.txnid;
  return rc;
}
/* Fills `model` (a page-sized buffer) with a freshly initialized meta-page
 * template for meta slot `num`: geometry derived from env->geo_in_bytes,
 * empty GC and Main trees, txnid MIN_TXNID+num, and a steady signature.
 * Returns a pointer just past the written page so calls can be chained to
 * build the meta triplet. */
__cold static page_t *meta_model(const MDBX_env *env, page_t *model,
                                 size_t num) {
  ENSURE(env, is_powerof2(env->ps));
  ENSURE(env, env->ps >= MDBX_MIN_PAGESIZE);
  ENSURE(env, env->ps <= MDBX_MAX_PAGESIZE);
  ENSURE(env, env->geo_in_bytes.lower >= MIN_MAPSIZE);
  ENSURE(env, env->geo_in_bytes.upper <= MAX_MAPSIZE);
  ENSURE(env, env->geo_in_bytes.now >= env->geo_in_bytes.lower);
  ENSURE(env, env->geo_in_bytes.now <= env->geo_in_bytes.upper);

  memset(model, 0, env->ps);
  model->pgno = (pgno_t)num;
  model->flags = P_META;
  meta_t *const model_meta = page_meta(model);
  unaligned_poke_u64(4, model_meta->magic_and_version, MDBX_DATA_MAGIC);

  /* geometry in pages, grow/shrink stored in packed (pv) form */
  model_meta->geometry.lower = bytes2pgno(env, env->geo_in_bytes.lower);
  model_meta->geometry.upper = bytes2pgno(env, env->geo_in_bytes.upper);
  model_meta->geometry.grow_pv =
      pages2pv(bytes2pgno(env, env->geo_in_bytes.grow));
  model_meta->geometry.shrink_pv =
      pages2pv(bytes2pgno(env, env->geo_in_bytes.shrink));
  model_meta->geometry.now = bytes2pgno(env, env->geo_in_bytes.now);
  model_meta->geometry.first_unallocated = NUM_METAS;

  ENSURE(env, model_meta->geometry.lower >= MIN_PAGENO);
  ENSURE(env, model_meta->geometry.upper <= MAX_PAGENO + 1);
  ENSURE(env, model_meta->geometry.now >= model_meta->geometry.lower);
  ENSURE(env, model_meta->geometry.now <= model_meta->geometry.upper);
  ENSURE(env, model_meta->geometry.first_unallocated >= MIN_PAGENO);
  ENSURE(env,
         model_meta->geometry.first_unallocated <= model_meta->geometry.now);
  ENSURE(env, model_meta->geometry.grow_pv ==
                  pages2pv(pv2pages(model_meta->geometry.grow_pv)));
  ENSURE(env, model_meta->geometry.shrink_pv ==
                  pages2pv(pv2pages(model_meta->geometry.shrink_pv)));

  model_meta->pagesize = env->ps;
  model_meta->trees.gc.flags = MDBX_INTEGERKEY;
  model_meta->trees.gc.root = P_INVALID;
  model_meta->trees.main.root = P_INVALID;
  /* distinct txnid per slot so the triplet has a well-defined ordering */
  meta_set_txnid(env, model_meta, MIN_TXNID + num);
  unaligned_poke_u64(4, model_meta->sign, meta_sign_calculate(model_meta));
  eASSERT(env, coherency_check_meta(env, model_meta, true));
  return ptr_disp(model, env->ps);
}
/* Initializes the three consecutive meta-pages of a new database inside
 * `buffer` and returns the meta header of the last (third) page. */
__cold meta_t *meta_init_triplet(const MDBX_env *env, void *buffer) {
  page_t *current = (page_t *)buffer;
  for (size_t n = 0; n < 2; ++n)
    current = meta_model(env, current, n);
  meta_model(env, current, 2);
  return page_meta(current);
}
/* Overwrites the target meta-page with a rebuilt model (used for repair /
 * rollback while the process holds the DB exclusively). When `shape` is
 * given, its geometry/trees/canary fields are grafted onto the fresh model
 * and coherency-checked before and after; txnid == 0 relaxes the checks.
 * Writes via msync (WRITEMAP) or pwrite+fsync and returns MDBX_SUCCESS,
 * MDBX_PROBLEM on failed coherency/validation, or an I/O error. */
__cold int __must_check_result meta_override(MDBX_env *env, size_t target,
                                             txnid_t txnid,
                                             const meta_t *shape) {
  int rc = env_page_auxbuffer(env);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  page_t *const page = env->page_auxbuf;
  meta_model(env, page, target);
  meta_t *const model = page_meta(page);
  meta_set_txnid(env, model, txnid);
  if (txnid)
    eASSERT(env, coherency_check_meta(env, model, true));
  if (shape) {
    /* sanity-check the donor meta before copying its fields */
    if (txnid && unlikely(!coherency_check_meta(env, shape, false))) {
      ERROR("bailout overriding meta-%zu since model failed "
            "FreeDB/MainDB %s-check for txnid #%" PRIaTXN,
            target, "pre", constmeta_txnid(shape));
      return MDBX_PROBLEM;
    }
    if (globals.runtime_flags & MDBX_DBG_DONT_UPGRADE)
      memcpy(&model->magic_and_version, &shape->magic_and_version,
             sizeof(model->magic_and_version));
    model->reserve16 = shape->reserve16;
    model->validator_id = shape->validator_id;
    model->extra_pagehdr = shape->extra_pagehdr;
    memcpy(&model->geometry, &shape->geometry, sizeof(model->geometry));
    memcpy(&model->trees, &shape->trees, sizeof(model->trees));
    memcpy(&model->canary, &shape->canary, sizeof(model->canary));
    memcpy(&model->pages_retired, &shape->pages_retired,
           sizeof(model->pages_retired));
    if (txnid) {
      /* keep the legacy magic when trees lack mod_txnid but are non-empty,
       * then re-verify the grafted model */
      if ((!model->trees.gc.mod_txnid && model->trees.gc.root != P_INVALID) ||
          (!model->trees.main.mod_txnid && model->trees.main.root != P_INVALID))
        memcpy(&model->magic_and_version, &shape->magic_and_version,
               sizeof(model->magic_and_version));
      if (unlikely(!coherency_check_meta(env, model, false))) {
        ERROR("bailout overriding meta-%zu since model failed "
              "FreeDB/MainDB %s-check for txnid #%" PRIaTXN,
              target, "post", txnid);
        return MDBX_PROBLEM;
      }
    }
  }

  meta_sign_as_steady(model);
  rc = meta_validate(env, model, page, (pgno_t)target, nullptr);
  if (unlikely(MDBX_IS_ERROR(rc)))
    return MDBX_PROBLEM;

  if (shape && memcmp(model, shape, sizeof(meta_t)) == 0) {
    NOTICE("skip overriding meta-%zu since no changes "
           "for txnid #%" PRIaTXN,
           target, txnid);
    return MDBX_SUCCESS;
  }

  if (env->flags & MDBX_WRITEMAP) {
#if MDBX_ENABLE_PGOP_STAT
    env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    /* flush the data area first so the new meta never points at un-synced
     * pages */
    rc = osal_msync(&env->dxb_mmap, 0,
                    pgno_align2os_bytes(env, model->geometry.first_unallocated),
                    MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* meta_override() called only while current process have exclusive
     * lock of a DB file. So meta-page could be updated directly without
     * clearing consistency flag by mdbx_meta_update_begin() */
    memcpy(pgno2page(env, target), page, env->ps);
    osal_flush_incoherent_cpu_writeback();
#if MDBX_ENABLE_PGOP_STAT
    env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, target + 1),
                    MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  } else {
#if MDBX_ENABLE_PGOP_STAT
    env->lck->pgops.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = osal_pwrite(env->fd4meta, page, env->ps, pgno2bytes(env, target));
    if (rc == MDBX_SUCCESS && env->fd4meta == env->lazy_fd) {
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
    }
    osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS),
                               globals.sys_pagesize);
  }
  eASSERT(env, (!env->txn && !env->basal_txn) ||
                   (env->stuck_meta == (int)target &&
                    (env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) ==
                        MDBX_EXCLUSIVE));
  return rc;
}
/* Validates a meta-page candidate (`meta`, backed by `page`) read from slot
 * `meta_number`: magic/version, page number and flags, pagesize, txnid
 * consistency, steady checksum, tree flags, geometry bounds, and GC/MainDB
 * root sanity. Out-of-range but recoverable geometry (e.g. a DB created by
 * a 64-bit build opened on 32-bit) is clamped in place with a warning.
 * Returns MDBX_SUCCESS, MDBX_RESULT_TRUE for "skip this meta, try another",
 * or a hard error (MDBX_INVALID / MDBX_VERSION_MISMATCH / MDBX_CORRUPTED /
 * MDBX_INCOMPATIBLE / MDBX_TOO_LARGE). If `guess_pagesize` is non-null it
 * is updated with the page size taken from this meta. */
__cold int meta_validate(MDBX_env *env, meta_t *const meta,
                         const page_t *const page, const unsigned meta_number,
                         unsigned *guess_pagesize) {
  const uint64_t magic_and_version =
      unaligned_peek_u64(4, &meta->magic_and_version);
  if (unlikely(magic_and_version != MDBX_DATA_MAGIC &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) {
    ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number,
          magic_and_version);
    return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID
                                                    : MDBX_VERSION_MISMATCH;
  }

  if (unlikely(page->pgno != meta_number)) {
    ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->pgno);
    return MDBX_INVALID;
  }

  if (unlikely(page->flags != P_META)) {
    ERROR("page #%u not a meta-page", meta_number);
    return MDBX_INVALID;
  }

  if (unlikely(!is_powerof2(meta->pagesize) ||
               meta->pagesize < MDBX_MIN_PAGESIZE ||
               meta->pagesize > MDBX_MAX_PAGESIZE)) {
    WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number,
            meta->pagesize);
    return is_powerof2(meta->pagesize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID;
  }

  if (guess_pagesize && *guess_pagesize != meta->pagesize) {
    *guess_pagesize = meta->pagesize;
    VERBOSE("meta[%u] took pagesize %u", meta_number, meta->pagesize);
  }

  /* the two txnid copies must agree, otherwise the update was torn */
  const txnid_t txnid = unaligned_peek_u64(4, &meta->txnid_a);
  if (unlikely(txnid != unaligned_peek_u64(4, &meta->txnid_b))) {
    WARNING("meta[%u] not completely updated, skip it", meta_number);
    return MDBX_RESULT_TRUE;
  }

  /* LY: check signature as a checksum */
  const uint64_t sign = meta_sign_get(meta);
  /* NOTE(review): "stready" looks like a typo for "steady" in this local's
   * name; behavior is unaffected. */
  const uint64_t sign_stready = meta_sign_calculate(meta);
  if (SIGN_IS_STEADY(sign) && unlikely(sign != sign_stready)) {
    WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64
            "), skip it",
            meta_number, sign, sign_stready);
    return MDBX_RESULT_TRUE;
  }

  if (unlikely(meta->trees.gc.flags != MDBX_INTEGERKEY)) {
    WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number,
            "GC/FreeDB", meta->trees.gc.flags);
    return MDBX_INCOMPATIBLE;
  }

  if (unlikely(!check_sdb_flags(meta->trees.main.flags))) {
    WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number,
            "MainDB", meta->trees.main.flags);
    return MDBX_INCOMPATIBLE;
  }

  DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO
        ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
        " +%u -%u, txn_id %" PRIaTXN ", %s",
        page->pgno, meta->trees.main.root, meta->trees.gc.root,
        meta->geometry.lower, meta->geometry.first_unallocated,
        meta->geometry.now, meta->geometry.upper,
        pv2pages(meta->geometry.grow_pv), pv2pages(meta->geometry.shrink_pv),
        txnid, durable_caption(meta));

  if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) {
    WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number,
            txnid);
    return MDBX_RESULT_TRUE;
  }

  /* geometry bounds: lower <= upper, both within [MIN_PAGENO, MAX_PAGENO+1] */
  if (unlikely(meta->geometry.lower < MIN_PAGENO ||
               meta->geometry.lower > MAX_PAGENO + 1)) {
    WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.lower);
    return MDBX_INVALID;
  }

  if (unlikely(meta->geometry.upper < MIN_PAGENO ||
               meta->geometry.upper > MAX_PAGENO + 1 ||
               meta->geometry.upper < meta->geometry.lower)) {
    WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.upper);
    return MDBX_INVALID;
  }

  if (unlikely(meta->geometry.first_unallocated < MIN_PAGENO ||
               meta->geometry.first_unallocated - 1 > MAX_PAGENO)) {
    WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.first_unallocated);
    return MDBX_CORRUPTED;
  }

  const uint64_t used_bytes =
      meta->geometry.first_unallocated * (uint64_t)meta->pagesize;
  if (unlikely(used_bytes > env->dxb_mmap.filesize)) {
    /* Here could be a race with DB-shrinking performed by other process */
    int err = osal_filesize(env->lazy_fd, &env->dxb_mmap.filesize);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    if (unlikely(used_bytes > env->dxb_mmap.filesize)) {
      WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64
              "), skip it",
              meta_number, used_bytes, env->dxb_mmap.filesize);
      return MDBX_CORRUPTED;
    }
  }
  if (unlikely(meta->geometry.first_unallocated - 1 > MAX_PAGENO ||
               used_bytes > MAX_MAPSIZE)) {
    WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it",
            meta_number, used_bytes);
    return MDBX_TOO_LARGE;
  }

  /* lower bound of the map size; clamp if it only exceeds this build's
   * (e.g. 32-bit) MAX_MAPSIZE while the actually used space still fits */
  pgno_t geo_lower = meta->geometry.lower;
  uint64_t mapsize_min = geo_lower * (uint64_t)meta->pagesize;
  STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MDBX_MAX_PAGESIZE);
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  STATIC_ASSERT((uint64_t)(MAX_PAGENO + 1) * MDBX_MIN_PAGESIZE % (4ul << 20) ==
                0);
  if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) {
    if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE &&
        mapsize_min <= MAX_MAPSIZE64) {
      eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO &&
                       used_bytes <= MAX_MAPSIZE);
      WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), "
              "but size of used space still acceptable (%" PRIu64 ")",
              meta_number, mapsize_min, used_bytes);
      geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->pagesize);
      if (geo_lower > MAX_PAGENO + 1) {
        geo_lower = MAX_PAGENO + 1;
        mapsize_min = geo_lower * (uint64_t)meta->pagesize;
      }
      WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO
              " instead of wrong %" PRIaPGNO
              ", will be corrected on next commit(s)",
              meta_number, "lower", geo_lower, meta->geometry.lower);
      meta->geometry.lower = geo_lower;
    } else {
      WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it",
              meta_number, mapsize_min);
      return MDBX_VERSION_MISMATCH;
    }
  }

  /* upper bound of the map size; same clamping strategy as for the lower */
  pgno_t geo_upper = meta->geometry.upper;
  uint64_t mapsize_max = geo_upper * (uint64_t)meta->pagesize;
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  if (unlikely(mapsize_max > MAX_MAPSIZE ||
               (MAX_PAGENO + 1) <
                   ceil_powerof2((size_t)mapsize_max, globals.sys_pagesize) /
                       (size_t)meta->pagesize)) {
    if (mapsize_max > MAX_MAPSIZE64) {
      WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it",
              meta_number, mapsize_max);
      return MDBX_VERSION_MISMATCH;
    }
    /* allow to open large DB from a 32-bit environment */
    eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO &&
                     used_bytes <= MAX_MAPSIZE);
    WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), "
            "but size of used space still acceptable (%" PRIu64 ")",
            meta_number, mapsize_max, used_bytes);
    geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->pagesize);
    if (geo_upper > MAX_PAGENO + 1) {
      geo_upper = MAX_PAGENO + 1;
      mapsize_max = geo_upper * (uint64_t)meta->pagesize;
    }
    WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO
            " instead of wrong %" PRIaPGNO
            ", will be corrected on next commit(s)",
            meta_number, "upper", geo_upper, meta->geometry.upper);
    meta->geometry.upper = geo_upper;
  }

  /* LY: check and silently put geometry.now into [geo.lower...geo.upper].
   *
   * Copy-with-compaction by old version of libmdbx could produce DB-file
   * less than meta.geo.lower bound, in case actual filling is low or no data
   * at all. This is not a problem as there is no damage or loss of data.
   * Therefore it is better not to consider such situation as an error, but
   * silently correct it. */
  pgno_t geo_now = meta->geometry.now;
  if (geo_now < geo_lower)
    geo_now = geo_lower;
  if (geo_now > geo_upper && meta->geometry.first_unallocated <= geo_upper)
    geo_now = geo_upper;

  if (unlikely(meta->geometry.first_unallocated > geo_now)) {
    WARNING("meta[%u] next-pageno (%" PRIaPGNO
            ") is beyond end-pgno (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.first_unallocated, geo_now);
    return MDBX_CORRUPTED;
  }
  if (meta->geometry.now != geo_now) {
    WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO
            " instead of wrong %" PRIaPGNO
            ", will be corrected on next commit(s)",
            meta_number, "now", geo_now, meta->geometry.now);
    meta->geometry.now = geo_now;
  }

  /* GC */
  if (meta->trees.gc.root == P_INVALID) {
    /* an empty tree must have all-zero statistics */
    if (unlikely(meta->trees.gc.branch_pages || meta->trees.gc.height ||
                 meta->trees.gc.items || meta->trees.gc.leaf_pages ||
                 meta->trees.gc.large_pages)) {
      WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC");
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->trees.gc.root >=
                      meta->geometry.first_unallocated)) {
    WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
            "GC", meta->trees.gc.root);
    return MDBX_CORRUPTED;
  }

  /* MainDB */
  if (meta->trees.main.root == P_INVALID) {
    if (unlikely(meta->trees.main.branch_pages || meta->trees.main.height ||
                 meta->trees.main.items || meta->trees.main.leaf_pages ||
                 meta->trees.main.large_pages)) {
      WARNING("meta[%u] has false-empty %s", meta_number, "MainDB");
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->trees.main.root >=
                      meta->geometry.first_unallocated)) {
    WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
            "MainDB", meta->trees.main.root);
    return MDBX_CORRUPTED;
  }

  /* a tree cannot have been modified after this meta's own txnid */
  if (unlikely(meta->trees.gc.mod_txnid > txnid)) {
    WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it",
            meta_number, meta->trees.gc.mod_txnid, "GC");
    return MDBX_CORRUPTED;
  }

  if (unlikely(meta->trees.main.mod_txnid > txnid)) {
    WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it",
            meta_number, meta->trees.main.mod_txnid, "MainDB");
    return MDBX_CORRUPTED;
  }

  return MDBX_SUCCESS;
}
/* Copies a meta-page header into `dest` and validates the copy, so the
 * original mapped page stays untouched by any clamping meta_validate()
 * performs. */
__cold int meta_validate_copy(MDBX_env *env, const meta_t *meta, meta_t *dest) {
  *dest = *meta;
  const pgno_t number = bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base));
  return meta_validate(env, dest, data_page(meta), number, nullptr);
}

203
src/meta.h Normal file
View File

@ -0,0 +1,203 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* Computes the "steady" signature for a meta-page. A real checksum is not
 * implemented yet (see the #if 0 TODO), so the current result is simply a
 * constant that is guaranteed to be neither DATASIGN_NONE nor
 * DATASIGN_WEAK. */
static inline uint64_t meta_sign_calculate(const meta_t *meta) {
  uint64_t sign = DATASIGN_NONE;
#if 0 /* TODO */
  sign = hippeus_hash64(...);
#else
  (void)meta;
#endif
  /* LY: never returns DATASIGN_NONE or DATASIGN_WEAK */
  return (sign > DATASIGN_WEAK) ? sign : ~sign;
}
/* Reads the (possibly concurrently updated) signature of a meta-page. */
static inline uint64_t meta_sign_get(const volatile meta_t *meta) {
  const uint64_t sign = unaligned_peek_u64_volatile(4, meta->sign);
  return sign;
}
/* Stamps the meta-page with a freshly calculated steady signature. */
static inline void meta_sign_as_steady(meta_t *meta) {
  const uint64_t steady_sign = meta_sign_calculate(meta);
  unaligned_poke_u64(4, meta->sign, steady_sign);
}
static inline bool meta_is_steady(const volatile meta_t *meta) {
return SIGN_IS_STEADY(meta_sign_get(meta));
}
/* Snapshots all three meta-pages into a classified troika. */
MDBX_INTERNAL troika_t meta_tap(const MDBX_env *env);
/* Bitmask of meta pairs holding identical txnid+steadiness. */
MDBX_INTERNAL unsigned meta_eq_mask(const troika_t *troika);
/* Refreshes *troika; true if anything changed since the previous tap. */
MDBX_INTERNAL bool meta_should_retry(const MDBX_env *env, troika_t *troika);
/* Exhaustive self-test of the troika FSM lookup table. */
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool troika_verify_fsm(void);

/* Descriptor of one meta-page: its txnid, steadiness, and a pointer that
 * may be viewed as volatile (live mapping) or plain const. */
struct meta_ptr {
  txnid_t txnid;
  union {
    const volatile meta_t *ptr_v;
    const meta_t *ptr_c;
  };
  size_t is_steady; /* non-zero iff the meta carries a steady signature */
};

MDBX_INTERNAL meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n);
MDBX_INTERNAL txnid_t meta_txnid(const volatile meta_t *meta);
MDBX_INTERNAL txnid_t recent_committed_txnid(const MDBX_env *env);
MDBX_INTERNAL int meta_sync(const MDBX_env *env, const meta_ptr_t head);

MDBX_INTERNAL const char *durable_caption(const meta_t *const meta);
MDBX_INTERNAL void meta_troika_dump(const MDBX_env *env,
                                    const troika_t *troika);

/* The n-th meta-page header within the mapped data file. */
#define METAPAGE(env, n) page_meta(pgno2page(env, n))
/* One-past-the-last meta-page (for range-style iteration). */
#define METAPAGE_END(env) METAPAGE(env, NUM_METAS)
static inline meta_ptr_t meta_recent(const MDBX_env *env,
const troika_t *troika) {
meta_ptr_t r;
r.txnid = troika->txnid[troika->recent];
r.ptr_v = METAPAGE(env, troika->recent);
r.is_steady = (troika->fsm >> troika->recent) & 1;
return r;
}
static inline meta_ptr_t meta_prefer_steady(const MDBX_env *env,
const troika_t *troika) {
meta_ptr_t r;
r.txnid = troika->txnid[troika->prefer_steady];
r.ptr_v = METAPAGE(env, troika->prefer_steady);
r.is_steady = (troika->fsm >> troika->prefer_steady) & 1;
return r;
}
/* Descriptor of the tail (least preferable) meta-page per the troika. */
static inline meta_ptr_t meta_tail(const MDBX_env *env,
                                   const troika_t *troika) {
  const uint8_t n = troika->tail_and_flags & 3;
  MDBX_ANALYSIS_ASSUME(n < NUM_METAS);
  meta_ptr_t result;
  result.txnid = troika->txnid[n];
  result.ptr_v = METAPAGE(env, n);
  result.is_steady = (troika->fsm >> n) & 1;
  return result;
}
static inline bool meta_bootid_match(const meta_t *meta) {
return memcmp(&meta->bootid, &globals.bootid, 16) == 0 &&
(globals.bootid.x | globals.bootid.y) != 0;
}
/* Decides whether a weak (non-steady) meta may be trusted: under an
 * exclusive lock only when the boot-id proves no reboot occurred; with a
 * shared DB only when another writer already has it open non-readonly. */
static inline bool meta_weak_acceptable(const MDBX_env *env, const meta_t *meta,
                                        const int lck_exclusive) {
  if (lck_exclusive)
    /* exclusive lock */
    return meta_bootid_match(meta);
  /* db already opened */
  return env->lck_mmap.lck &&
         (env->lck_mmap.lck->envmode.weak & MDBX_RDONLY) == 0;
}
/* Reads the meta's txnid: the two unaligned halves txnid_a/txnid_b must
 * agree, otherwise the update is torn/in-progress and 0 is returned. */
MDBX_NOTHROW_PURE_FUNCTION static inline txnid_t
constmeta_txnid(const meta_t *meta) {
  const txnid_t head = unaligned_peek_u64(4, &meta->txnid_a);
  const txnid_t tail = unaligned_peek_u64(4, &meta->txnid_b);
  if (unlikely(head != tail))
    return 0;
  return head;
}
/* Opens a torn-tolerant txnid update on a live meta-page: zeroes txnid_b,
 * then stores the new value into txnid_a. Until meta_update_end() writes
 * txnid into txnid_b, the pair disagrees and constmeta_txnid() yields 0. */
static inline void meta_update_begin(const MDBX_env *env, meta_t *meta,
                                     txnid_t txnid) {
  eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
  eASSERT(env, unaligned_peek_u64(4, meta->txnid_a) < txnid &&
                   unaligned_peek_u64(4, meta->txnid_b) < txnid);
  (void)env;
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) &&      \
    MDBX_UNALIGNED_OK >= 8
  /* Platforms where unaligned 64-bit atomic stores are safe: one store
   * per field suffices. */
  atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_b, 0, mo_AcquireRelease);
  atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_a, txnid,
                 mo_AcquireRelease);
#else
  /* Otherwise each 64-bit txnid is written as two 32-bit halves, with the
   * half index chosen by host endianness. */
  atomic_store32(&meta->txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], 0,
                 mo_AcquireRelease);
  atomic_store32(&meta->txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], 0,
                 mo_AcquireRelease);
  atomic_store32(&meta->txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)txnid, mo_AcquireRelease);
  atomic_store32(&meta->txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)(txnid >> 32), mo_AcquireRelease);
#endif
}
/* Completes the txnid update started by meta_update_begin(): refreshes the
 * boot-id and stores txnid into txnid_b so both halves agree again. */
static inline void meta_update_end(const MDBX_env *env, meta_t *meta,
                                   txnid_t txnid) {
  eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
  eASSERT(env, unaligned_peek_u64(4, meta->txnid_a) == txnid);
  eASSERT(env, unaligned_peek_u64(4, meta->txnid_b) < txnid);
  (void)env;
  jitter4testing(true);
  memcpy(&meta->bootid, &globals.bootid, 16);
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) &&      \
    MDBX_UNALIGNED_OK >= 8
  atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_b, txnid,
                 mo_AcquireRelease);
#else
  /* store low/high 32-bit halves per host endianness */
  atomic_store32(&meta->txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)txnid, mo_AcquireRelease);
  atomic_store32(&meta->txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)(txnid >> 32), mo_AcquireRelease);
#endif
}
/* Sets both txnid halves at once on a meta-IMAGE (a buffer outside the
 * mapped meta-pages, per the assertion below), so no atomics are needed. */
static inline void meta_set_txnid(const MDBX_env *env, meta_t *meta,
                                  const txnid_t txnid) {
  eASSERT(env, !env->dxb_mmap.base || meta < METAPAGE(env, 0) ||
                   meta >= METAPAGE_END(env));
  (void)env;
  /* update inconsistently since this function used ONLY for filling meta-image
   * for writing, but not the actual meta-page */
  memcpy(&meta->bootid, &globals.bootid, 16);
  unaligned_poke_u64(4, meta->txnid_a, txnid);
  unaligned_poke_u64(4, meta->txnid_b, txnid);
}
/* Scaled three-way comparison: 0 when a < b, s when a == b, 2*s when a > b. */
static inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) {
  if (a < b)
    return 0;
  return (a == b) ? s : (uint8_t)(2 * s);
}
/* Given a three-way txnid comparison (0/1/2), picks A as "more recent":
 * strictly greater txnid wins; on a tie the steady one wins. */
static inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int, bool a_steady,
                                      bool b_steady) {
  assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */);
  if (ab_cmp2int > 1)
    return true;
  return ab_cmp2int == 1 && a_steady > b_steady;
}
/* Given a three-way txnid comparison (0/1/2), picks A as "more steady":
 * steadiness dominates; among equally-steady ones the greater txnid wins. */
static inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int, bool a_steady,
                                      bool b_steady) {
  assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */);
  if (a_steady != b_steady)
    return a_steady > b_steady;
  return ab_cmp2int > 1;
}
/* Convenience wrapper: true when (a_txnid, a_steady) is the more recent. */
static inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady,
                                      txnid_t b_txnid, bool b_steady) {
  const uint8_t cmp = meta_cmp2int(a_txnid, b_txnid, 1);
  return meta_cmp2recent(cmp, a_steady, b_steady);
}
/* Convenience wrapper: true when (a_txnid, a_steady) is the more steady. */
static inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady,
                                      txnid_t b_txnid, bool b_steady) {
  const uint8_t cmp = meta_cmp2int(a_txnid, b_txnid, 1);
  return meta_cmp2steady(cmp, a_steady, b_steady);
}
MDBX_INTERNAL meta_t *meta_init_triplet(const MDBX_env *env, void *buffer);
MDBX_INTERNAL int meta_validate(MDBX_env *env, meta_t *const meta,
const page_t *const page,
const unsigned meta_number,
unsigned *guess_pagesize);
MDBX_INTERNAL int __must_check_result meta_validate_copy(MDBX_env *env,
const meta_t *meta,
meta_t *dest);
MDBX_INTERNAL int __must_check_result meta_override(MDBX_env *env,
size_t target,
txnid_t txnid,
const meta_t *shape);
MDBX_INTERNAL int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto);

252
src/misc.c Normal file
View File

@ -0,0 +1,252 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Estimates whether OS readahead is reasonable for a DB of `volume` bytes
 * with a `redundancy` margin (may be negative): MDBX_RESULT_TRUE when the
 * working set fits in total and currently-available RAM, MDBX_RESULT_FALSE
 * otherwise, or an error code from mdbx_get_sysraminfo(). */
__cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) {
  if (volume <= 1024 * 1024 * 4ul)
    /* up to 4 MiB: always reasonable */
    return MDBX_RESULT_TRUE;
  intptr_t pagesize, total_ram_pages;
  int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  const int log2page = log2n_powerof2(pagesize);
  /* round up to whole RAM pages; a negative redundancy shrinks the need */
  const intptr_t volume_pages = (volume + pagesize - 1) >> log2page;
  const intptr_t redundancy_pages =
      (redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page)
                       : (intptr_t)(redundancy + pagesize - 1) >> log2page;
  if (volume_pages >= total_ram_pages ||
      volume_pages + redundancy_pages >= total_ram_pages)
    return MDBX_RESULT_FALSE;
  /* queried separately (and after the cheap total-RAM rejection above)
   * since the available-RAM figure is the more expensive/volatile one */
  intptr_t avail_ram_pages;
  err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  return (volume_pages + redundancy_pages >= avail_ram_pages)
             ? MDBX_RESULT_FALSE
             : MDBX_RESULT_TRUE;
}
/* Reads and optionally advances the persistent sequence of a named DB.
 * If `result` is non-null it receives the value BEFORE the increment.
 * A non-zero `increment` requires a write transaction and a non-GC dbi
 * (MDBX_EACCESS otherwise) and returns MDBX_RESULT_TRUE on 64-bit
 * overflow, leaving the sequence unchanged. */
int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result,
                      uint64_t increment) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  rc = dbi_check(txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
    /* the cached tree_t is stale: refresh it before reading the sequence */
    rc = sdb_fetch(txn, dbi);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  tree_t *dbs = &txn->dbs[dbi];
  if (likely(result))
    *result = dbs->sequence;
  if (likely(increment > 0)) {
    if (unlikely(dbi == FREE_DBI || (txn->flags & MDBX_TXN_RDONLY) != 0))
      return MDBX_EACCESS;
    uint64_t new = dbs->sequence + increment;
    if (unlikely(new < increment))
      /* unsigned wrap-around detected: refuse without modification */
      return MDBX_RESULT_TRUE;
    tASSERT(txn, new > dbs->sequence);
    dbs->sequence = new;
    /* mark both the txn and this dbi dirty so the change is committed */
    txn->flags |= MDBX_TXN_DIRTY;
    txn->dbi_state[dbi] |= DBI_DIRTY;
  }
  return MDBX_SUCCESS;
}
/* Compares two keys with the key-comparator configured for the given DBI. */
int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
             const MDBX_val *b) {
  eASSERT(nullptr, txn->signature == txn_signature);
  tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi));
  const MDBX_env *const env = txn->env;
  tASSERT(txn, dbi < env->n_dbi && (env->dbs_flags[dbi] & DB_VALID) != 0);
  return env->kvs[dbi].clc.k.cmp(a, b);
}
/* Compares two data items with the value-comparator configured for the DBI. */
int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
              const MDBX_val *b) {
  eASSERT(nullptr, txn->signature == txn_signature);
  tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi));
  const MDBX_env *const env = txn->env;
  tASSERT(txn, dbi < env->n_dbi && (env->dbs_flags[dbi] & DB_VALID));
  return env->kvs[dbi].clc.v.cmp(a, b);
}
/* Returns the built-in key comparator matching the given DB flags. */
__cold MDBX_cmp_func *mdbx_get_keycmp(MDBX_db_flags_t flags) {
  return builtin_keycmp(flags);
}
/* Returns the built-in data/value comparator matching the given DB flags. */
__cold MDBX_cmp_func *mdbx_get_datacmp(MDBX_db_flags_t flags) {
  return builtin_datacmp(flags);
}
/*----------------------------------------------------------------------------*/
/* Maps a libmdbx-specific error code to its static description string,
 * or nullptr when `errnum` is not a libmdbx code (so the caller may fall
 * back to the system strerror). Fixed a typo in the MDBX_CURSOR_FULL
 * message ("reachedn" -> "reached"). */
__cold const char *mdbx_liberr2str(int errnum) {
  /* Table of descriptions for MDBX errors; indexed by errnum offset from
   * MDBX_KEYEXIST, so the entry order must follow the error-code values. */
  static const char *const tbl[] = {
      "MDBX_KEYEXIST: Key/data pair already exists",
      "MDBX_NOTFOUND: No matching key/data pair found",
      "MDBX_PAGE_NOTFOUND: Requested page not found",
      "MDBX_CORRUPTED: Database is corrupted",
      "MDBX_PANIC: Environment had fatal error",
      "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx",
      "MDBX_INVALID: File is not an MDBX file",
      "MDBX_MAP_FULL: Environment mapsize limit reached",
      "MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)",
      "MDBX_READERS_FULL: Too many readers (maxreaders reached)",
      nullptr /* MDBX_TLS_FULL (-30789): unused in MDBX */,
      "MDBX_TXN_FULL: Transaction has too many dirty pages,"
      " i.e transaction is too big",
      "MDBX_CURSOR_FULL: Cursor stack limit reached - this usually indicates"
      " corruption, i.e branch-pages loop",
      "MDBX_PAGE_FULL: Internal error - Page has no more space",
      "MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend"
      " mapping, e.g. since address space is unavailable or busy,"
      " or Operation system not supported such operations",
      "MDBX_INCOMPATIBLE: Environment or database is not compatible"
      " with the requested operation or the specified flags",
      "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot,"
      " e.g. read-transaction already run for current thread",
      "MDBX_BAD_TXN: Transaction is not valid for requested operation,"
      " e.g. had errored and be must aborted, has a child, or is invalid",
      "MDBX_BAD_VALSIZE: Invalid size or alignment of key or data"
      " for target database, either invalid subDB name",
      "MDBX_BAD_DBI: The specified DBI-handle is invalid"
      " or changed by another thread/transaction",
      "MDBX_PROBLEM: Unexpected internal error, transaction should be aborted",
      "MDBX_BUSY: Another write transaction is running,"
      " or environment is already used while opening with MDBX_EXCLUSIVE flag",
  };

  if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) {
    int i = errnum - MDBX_KEYEXIST;
    return tbl[i];
  }

  switch (errnum) {
  case MDBX_SUCCESS:
    return "MDBX_SUCCESS: Successful";
  case MDBX_EMULTIVAL:
    return "MDBX_EMULTIVAL: The specified key has"
           " more than one associated value";
  case MDBX_EBADSIGN:
    return "MDBX_EBADSIGN: Wrong signature of a runtime object(s),"
           " e.g. memory corruption or double-free";
  case MDBX_WANNA_RECOVERY:
    return "MDBX_WANNA_RECOVERY: Database should be recovered,"
           " but this could NOT be done automatically for now"
           " since it opened in read-only mode";
  case MDBX_EKEYMISMATCH:
    return "MDBX_EKEYMISMATCH: The given key value is mismatched to the"
           " current cursor position";
  case MDBX_TOO_LARGE:
    return "MDBX_TOO_LARGE: Database is too large for current system,"
           " e.g. could NOT be mapped into RAM";
  case MDBX_THREAD_MISMATCH:
    return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not"
           " owned object, e.g. a transaction that started by another thread";
  case MDBX_TXN_OVERLAPPING:
    return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for"
           " the current thread";
  case MDBX_DUPLICATED_CLK:
    return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists,"
           " please keep one and remove unused other";
  case MDBX_DANGLING_DBI:
    return "MDBX_DANGLING_DBI: Some cursors and/or other resources should be"
           " closed before subDb or corresponding DBI-handle could be (re)used";
  default:
    return nullptr;
  }
}
/* Reentrant error-to-string: libmdbx codes map to static strings via
 * mdbx_liberr2str(); anything else is rendered into the caller-provided
 * `buf` using the platform facility. Returns a pointer to a static string
 * or into `buf` (always NUL-terminated on the buf paths). */
__cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) {
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg && buflen > 0 && buflen < INT_MAX) {
#if defined(_WIN32) || defined(_WIN64)
    const DWORD size = FormatMessageA(
        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
        errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
        nullptr);
    return size ? buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
#elif defined(_GNU_SOURCE) && defined(__GLIBC__)
    /* GNU-specific strerror_r(): returns a pointer which may be `buf`
     * or an internal static string */
    if (errnum > 0)
      msg = strerror_r(errnum, buf, buflen);
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600)
    /* XSI-compliant strerror_r(): fills `buf` and returns 0 on success */
    if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0)
      msg = buf;
#else
    /* fallback: copy from the non-reentrant strerror() */
    if (errnum > 0) {
      msg = strerror(errnum);
      if (msg) {
        strncpy(buf, msg, buflen);
        msg = buf;
      }
    }
#endif
    if (!msg) {
      (void)snprintf(buf, buflen, "error %d", errnum);
      msg = buf;
    }
    /* guarantee NUL-termination (strncpy above may leave buf unterminated) */
    buf[buflen - 1] = '\0';
  }
  return msg;
}
/* Non-reentrant error-to-string convenience wrapper.
 * NOTE: uses a static buffer on some paths, hence NOT thread-safe;
 * prefer mdbx_strerror_r() in multi-threaded code. */
__cold const char *mdbx_strerror(int errnum) {
#if defined(_WIN32) || defined(_WIN64)
  static char buf[1024];
  return mdbx_strerror_r(errnum, buf, sizeof(buf));
#else
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg) {
    if (errnum > 0)
      msg = strerror(errnum);
    if (!msg) {
      static char buf[32];
      (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum);
      msg = buf;
    }
  }
  return msg;
#endif
}
#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */
/* Windows-only reentrant variant that converts the system message from the
 * ANSI codepage to the OEM (console) codepage via CharToOemBuffA(). */
const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) {
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg && buflen > 0 && buflen < INT_MAX) {
    const DWORD size = FormatMessageA(
        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
        errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
        nullptr);
    if (!size)
      msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
    else if (!CharToOemBuffA(buf, buf, size))
      msg = "CharToOemBuffA() failed";
    else
      msg = buf;
  }
  return msg;
}
/* Windows-only non-reentrant wrapper over mdbx_strerror_r_ANSI2OEM();
 * NOT thread-safe due to the static buffer. */
const char *mdbx_strerror_ANSI2OEM(int errnum) {
  static char buf[1024];
  return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf));
}
#endif /* Bit of madness for Windows */

477
src/mvcc-readers.c Normal file
View File

@ -0,0 +1,477 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Binds the calling thread/process to a free reader slot in the MVCC
 * reader table (under the reader-table lock). Returns a bsr_t with the
 * claimed slot and MDBX_SUCCESS, or an error (MDBX_PANIC on a fatal env,
 * MDBX_EPERM when the DB is not mapped, MDBX_READERS_FULL when no slot
 * can be reclaimed). */
bsr_t mvcc_bind_slot(MDBX_env *env, const uintptr_t tid) {
  eASSERT(env, env->lck_mmap.lck);
  eASSERT(env, env->lck->magic_and_version == MDBX_LOCK_MAGIC);
  eASSERT(env, env->lck->os_and_format == MDBX_LOCK_FORMAT);
  bsr_t result = {lck_rdt_lock(env), nullptr};
  if (unlikely(MDBX_IS_ERROR(result.err)))
    return result;
  if (unlikely(env->flags & ENV_FATAL_ERROR)) {
    lck_rdt_unlock(env);
    result.err = MDBX_PANIC;
    return result;
  }
  if (unlikely(!env->dxb_mmap.base)) {
    lck_rdt_unlock(env);
    result.err = MDBX_EPERM;
    return result;
  }
  if (unlikely(env->registered_reader_pid != env->pid)) {
    /* first reader of this process: register our pid in the LCK */
    result.err = lck_rpid_set(env);
    if (unlikely(result.err != MDBX_SUCCESS)) {
      lck_rdt_unlock(env);
      return result;
    }
    env->registered_reader_pid = env->pid;
  }
  result.err = MDBX_SUCCESS;
  size_t slot, nreaders;
  while (1) {
    /* scan for a vacant slot (pid == 0) within the current table length */
    nreaders = env->lck->rdt_length.weak;
    for (slot = 0; slot < nreaders; slot++)
      if (!atomic_load32(&env->lck->rdt[slot].pid, mo_AcquireRelease))
        break;
    if (likely(slot < env->max_readers))
      break;
    /* table full: try to reclaim slots of dead readers, then rescan */
    result.err = mvcc_cleanup_dead(env, true, nullptr);
    if (result.err != MDBX_RESULT_TRUE) {
      lck_rdt_unlock(env);
      result.err =
          (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err;
      return result;
    }
  }
  result.rslot = &env->lck->rdt[slot];
  /* Claim the reader slot, carefully since other code
   * uses the reader table un-mutexed: First reset the
   * slot, next publish it in lck->rdt_length. After
   * that, it is safe for mdbx_env_close() to touch it.
   * When it will be closed, we can finally claim it. */
  atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease);
  safe64_reset(&result.rslot->txnid, true);
  if (slot == nreaders)
    env->lck->rdt_length.weak = (uint32_t)++nreaders;
  result.rslot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : tid;
  atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease);
  lck_rdt_unlock(env);
  if (likely(env->flags & ENV_TXKEY)) {
    /* remember the slot in thread-local storage for sticky-thread mode */
    eASSERT(env, env->registered_reader_pid == env->pid);
    thread_rthc_set(env->me_txkey, result.rslot);
  }
  return result;
}
/* Computes the oldest MVCC snapshot txnid still referenced by any live
 * reader, bounded above by `steady`, and caches it in lck->cached_oldest.
 * Rescans the reader table until rdt_refresh_flag stays clear. */
__hot txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady) {
  const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
  eASSERT(env, steady <= env->basal_txn->txnid);
  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr /* exclusive without-lck mode */)) {
    /* no other readers can exist: oldest == steady */
    eASSERT(env, env->lck == lckless_stub(env));
    env->lck->rdt_refresh_flag.weak = nothing_changed;
    return env->lck->cached_oldest.weak = steady;
  }
  const txnid_t prev_oldest =
      atomic_load64(&lck->cached_oldest, mo_AcquireRelease);
  eASSERT(env, steady >= prev_oldest);
  txnid_t new_oldest = prev_oldest;
  /* loop until no reader registered/changed during our scan */
  while (nothing_changed !=
         atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease)) {
    lck->rdt_refresh_flag.weak = nothing_changed;
    jitter4testing(false);
    const size_t snap_nreaders =
        atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    new_oldest = steady;
    for (size_t i = 0; i < snap_nreaders; ++i) {
      const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
      if (!pid)
        continue;
      jitter4testing(true);
      const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid);
      if (unlikely(rtxn < prev_oldest)) {
        /* a reader claims a txnid older than the cached oldest: stuck slot,
         * try to reset it (only if the table is otherwise quiescent) */
        if (unlikely(nothing_changed == atomic_load32(&lck->rdt_refresh_flag,
                                                      mo_AcquireRelease)) &&
            safe64_reset_compare(&lck->rdt[i].txnid, rtxn)) {
          NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN
                 " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN,
                 i, snap_nreaders, pid, rtxn, prev_oldest, steady);
        }
        continue;
      }
      if (rtxn < new_oldest) {
        new_oldest = rtxn;
        if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest)
          break; /* cannot get older than the cached value: stop early */
      }
    }
  }
  if (new_oldest != prev_oldest) {
    VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest);
    eASSERT(env, new_oldest >= lck->cached_oldest.weak);
    atomic_store64(&lck->cached_oldest, new_oldest, mo_Relaxed);
  }
  return new_oldest;
}
/* Returns the largest page count pinned by any live reader snapshot,
 * at least `last_used_page`. Reads each slot's (pages, txnid) pair twice
 * and retries the whole scan on a torn read. */
pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page) {
  lck_t *const lck = env->lck_mmap.lck;
  if (likely(lck != nullptr /* check for exclusive without-lck mode */)) {
  retry:;
    const size_t snap_nreaders =
        atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    for (size_t i = 0; i < snap_nreaders; ++i) {
      if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) {
        /* jitter4testing(true); */
        const pgno_t snap_pages =
            atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
        /* re-read to detect a concurrent slot update (torn pair) */
        if (unlikely(snap_pages !=
                         atomic_load32(&lck->rdt[i].snapshot_pages_used,
                                       mo_AcquireRelease) ||
                     snap_txnid != safe64_read(&lck->rdt[i].txnid)))
          goto retry;
        if (last_used_page < snap_pages && snap_txnid <= env->basal_txn->txnid)
          last_used_page = snap_pages;
      }
    }
  }
  return last_used_page;
}
/* Find largest mvcc-snapshot still referenced by this process. */
pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest) {
  lck_t *const lck = env->lck_mmap.lck;
  if (likely(lck != nullptr /* exclusive mode */)) {
    const size_t snap_nreaders =
        atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    for (size_t i = 0; i < snap_nreaders; ++i) {
    retry:
      /* only slots owned by this process are considered */
      if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease) == env->pid) {
        /* jitter4testing(true); */
        const pgno_t snap_pages =
            atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
        /* re-read to detect a concurrent slot update; retry this slot */
        if (unlikely(snap_pages !=
                         atomic_load32(&lck->rdt[i].snapshot_pages_used,
                                       mo_AcquireRelease) ||
                     snap_txnid != safe64_read(&lck->rdt[i].txnid)))
          goto retry;
        if (largest < snap_pages &&
            atomic_load64(&lck->cached_oldest, mo_AcquireRelease) <=
                /* ignore pending updates */ snap_txnid &&
            snap_txnid <= MAX_TXNID)
          largest = snap_pages;
      }
    }
  }
  return largest;
}
/* Inserts `pid` into a sorted pid-list (list[0] holds the length, elements
 * occupy list[1..length]) keeping it ordered and duplicate-free.
 * Returns true when inserted, false when `pid` is already present.
 * The caller must guarantee capacity for one more element. */
static bool pid_insert(uint32_t *list, uint32_t pid) {
  /* binary search of pid in list */
  size_t base = 0;
  size_t cursor = 1;
  int val = 0;
  size_t n = /* length */ list[0];
  while (n > 0) {
    const size_t pivot = n >> 1;
    cursor = base + pivot + 1;
    /* Overflow-safe three-way comparison. The previous form
     * `(int32_t)(pid - list[cursor])` yields the wrong sign whenever the
     * unsigned difference exceeds INT32_MAX, breaking the search order. */
    val = (pid > list[cursor]) - (pid < list[cursor]);
    if (val < 0) {
      n = pivot;
    } else if (val > 0) {
      base = cursor;
      n -= pivot + 1;
    } else {
      /* found, so it's a duplicate */
      return false;
    }
  }
  if (val > 0)
    ++cursor;
  /* shift the tail up by one and place the new pid at `cursor` */
  list[0]++;
  for (n = list[0]; n > cursor; n--)
    list[n] = list[n - 1];
  list[n] = pid;
  return true;
}
/* Scans the reader table and clears slots owned by dead processes.
 * `rdt_locked` tells whether the caller already holds the reader-table
 * lock; otherwise it is taken lazily on the first stale candidate.
 * On success `*dead` (if non-null) receives the number of cleared slots. */
__cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked,
                                           int *dead) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  eASSERT(env, rdt_locked >= 0);
  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr)) {
    /* exclusive mode */
    if (dead)
      *dead = 0;
    return MDBX_SUCCESS;
  }
  const size_t snap_nreaders =
      atomic_load32(&lck->rdt_length, mo_AcquireRelease);
  /* sorted list of already-checked pids; heap-allocated only for
   * unusually long reader tables */
  uint32_t pidsbuf_onstask[142];
  uint32_t *const pids =
      (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask))
          ? pidsbuf_onstask
          : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t));
  if (unlikely(!pids))
    return MDBX_ENOMEM;
  pids[0] = 0;
  int count = 0;
  for (size_t i = 0; i < snap_nreaders; i++) {
    const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
    if (pid == 0)
      continue /* skip empty */;
    if (pid == env->pid)
      continue /* skip self */;
    if (!pid_insert(pids, pid))
      continue /* such pid already processed */;
    int err = lck_rpid_check(env, pid);
    if (err == MDBX_RESULT_TRUE)
      continue /* reader is live */;
    if (err != MDBX_SUCCESS) {
      rc = err;
      break /* lck_rpid_check() failed */;
    }
    /* stale reader found */
    if (!rdt_locked) {
      err = lck_rdt_lock(env);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }
      rdt_locked = -1; /* remember that WE took the lock (unlock on exit) */
      if (err == MDBX_RESULT_TRUE) {
        /* mutex recovered, the mdbx_ipclock_failed() checked all readers */
        rc = MDBX_RESULT_TRUE;
        break;
      }
      /* another process may have cleaned and reused the slot, recheck */
      if (lck->rdt[i].pid.weak != pid)
        continue;
      err = lck_rpid_check(env, pid);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }
      if (err != MDBX_SUCCESS)
        continue /* the race with other process, slot reused */;
    }
    /* clean it: the same pid may own several slots */
    for (size_t ii = i; ii < snap_nreaders; ii++) {
      if (lck->rdt[ii].pid.weak == pid) {
        DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid,
              lck->rdt[ii].txnid.weak);
        atomic_store32(&lck->rdt[ii].pid, 0, mo_Relaxed);
        atomic_store32(&lck->rdt_refresh_flag, true, mo_AcquireRelease);
        count++;
      }
    }
  }
  if (likely(!MDBX_IS_ERROR(rc)))
    atomic_store64(&lck->readers_check_timestamp, osal_monotime(), mo_Relaxed);
  if (rdt_locked < 0)
    lck_rdt_unlock(env);
  if (pids != pidsbuf_onstask)
    osal_free(pids);
  if (dead)
    *dead = count;
  return rc;
}
/* Tries to advance the oldest reader past `straggler` when the DB is out
 * of space: cleans dead readers and, if a Handle-Slow-Readers callback is
 * set, repeatedly invokes it for the stuck slot until it yields or aborts.
 * Returns the resulting oldest txnid. */
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
  DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
  osal_memory_fence(mo_AcquireRelease, false);
  MDBX_hsr_func *const callback = env->hsr_callback;
  txnid_t oldest = 0;
  bool notify_eof_of_loop = false;
  int retry = 0;
  do {
    const txnid_t steady =
        env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady];
    env->lck->rdt_refresh_flag.weak = /* force refresh */ true;
    oldest = mvcc_shapshot_oldest(env, steady);
    eASSERT(env, oldest < env->basal_txn->txnid);
    eASSERT(env, oldest >= straggler);
    eASSERT(env, oldest >= env->lck->cached_oldest.weak);
    lck_t *const lck = env->lck_mmap.lck;
    /* done when the straggler is no longer the oldest, or nothing to kick */
    if (oldest == steady || oldest > straggler || /* without-LCK mode */ !lck)
      break;
    if (MDBX_IS_ERROR(mvcc_cleanup_dead(env, false, nullptr)))
      break;
    if (!callback)
      break;
    /* locate the reader slot pinned exactly at `straggler` */
    reader_slot_t *stucked = nullptr;
    uint64_t hold_retired = 0;
    for (size_t i = 0; i < lck->rdt_length.weak; ++i) {
      const uint64_t snap_retired =
          atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed);
      const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid);
      if (rtxn == straggler &&
          atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) {
        hold_retired = snap_retired;
        stucked = &lck->rdt[i];
      }
    }
    if (!stucked)
      break;
    uint32_t pid = atomic_load32(&stucked->pid, mo_AcquireRelease);
    uint64_t tid = atomic_load64(&stucked->tid, mo_AcquireRelease);
    /* re-validate the slot; on any change rescan from the top */
    if (safe64_read(&stucked->txnid) != straggler || !pid ||
        stucked->snapshot_pages_retired.weak != hold_retired)
      continue;
    /* estimate the lag and the space pinned by the stuck reader for hsr */
    const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika);
    const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP;
    const uint64_t head_retired =
        unaligned_peek_u64(4, head.ptr_c->pages_retired);
    const size_t space =
        (head_retired > hold_retired)
            ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired))
            : 0;
    int rc =
        callback(env, env->txn, pid, (mdbx_tid_t)((intptr_t)tid), straggler,
                 (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
    if (rc < 0)
      /* hsr returned error and/or agree MDBX_MAP_FULL error */
      break;
    if (rc > 0) {
      if (rc == 1) {
        /* hsr reported transaction (will be) aborted asynchronous */
        safe64_reset_compare(&stucked->txnid, straggler);
      } else {
        /* hsr reported reader process was killed and slot should be cleared */
        safe64_reset(&stucked->txnid, true);
        atomic_store64(&stucked->tid, 0, mo_Relaxed);
        atomic_store32(&stucked->pid, 0, mo_AcquireRelease);
      }
    } else if (!notify_eof_of_loop) {
#if MDBX_ENABLE_PROFGC
      env->lck->pgops.gc_prof.kicks += 1;
#endif /* MDBX_ENABLE_PROFGC */
      notify_eof_of_loop = true;
    }
  } while (++retry < INT_MAX);
  if (notify_eof_of_loop) {
    /* notify end of hsr-loop: final callback with negative retry count */
    const txnid_t turn = oldest - straggler;
    if (turn)
      NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN,
             straggler, oldest, turn);
    callback(env, env->txn, 0, 0, straggler,
             (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
  }
  return oldest;
}
/*----------------------------------------------------------------------------*/
/* Pre-registers the calling thread as a reader (binds a reader slot) so a
 * later read-txn start avoids the slot-allocation path. Returns
 * MDBX_RESULT_TRUE when the thread is already registered, MDBX_EINVAL in
 * MDBX_NOSTICKYTHREADS mode, and MDBX_TXN_OVERLAPPING when this thread
 * owns the write transaction. */
__cold int mdbx_thread_register(const MDBX_env *env) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!env->lck_mmap.lck))
    return (env->flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM;
  if (unlikely((env->flags & ENV_TXKEY) == 0)) {
    eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
    return MDBX_EINVAL /* MDBX_NOSTICKYTHREADS mode */;
  }
  eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
  reader_slot_t *r = thread_rthc_get(env->me_txkey);
  if (unlikely(r != nullptr)) {
    /* a slot is already bound via thread-local storage */
    eASSERT(env, r->pid.weak == env->pid);
    eASSERT(env, r->tid.weak == osal_thread_self());
    if (unlikely(r->pid.weak != env->pid))
      return MDBX_BAD_RSLOT;
    return MDBX_RESULT_TRUE /* already registered */;
  }
  const uintptr_t tid = osal_thread_self();
  if (env->txn && unlikely(env->basal_txn->owner == tid))
    return MDBX_TXN_OVERLAPPING;
  return mvcc_bind_slot((MDBX_env *)env, tid).err;
}
/* Releases the reader slot bound to the calling thread (the inverse of
 * mdbx_thread_register()). Returns MDBX_RESULT_TRUE when there is nothing
 * to release, MDBX_BUSY when the slot still holds an active read-txn. */
__cold int mdbx_thread_unregister(const MDBX_env *env) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!env->lck_mmap.lck))
    return MDBX_RESULT_TRUE;
  if (unlikely((env->flags & ENV_TXKEY) == 0)) {
    eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
    return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */;
  }
  eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
  reader_slot_t *r = thread_rthc_get(env->me_txkey);
  if (unlikely(r == nullptr))
    return MDBX_RESULT_TRUE /* not registered */;
  eASSERT(env, r->pid.weak == env->pid);
  eASSERT(env, r->tid.weak == osal_thread_self());
  if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self()))
    return MDBX_BAD_RSLOT;
  /* a txnid below the threshold means a read-txn is still running here */
  eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
  if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD))
    return MDBX_BUSY /* transaction is still active */;
  atomic_store32(&r->pid, 0, mo_Relaxed);
  atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
  thread_rthc_set(env->me_txkey, nullptr);
  return MDBX_SUCCESS;
}

395
src/node.c Normal file
View File

@ -0,0 +1,395 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Inserts a fixed-size key at position `indx` of a DUPFIX (leaf2) page:
 * such pages store bare keys of tree->dupfix_size bytes, no nodes.
 * Returns MDBX_PAGE_FULL (and marks the txn in error) when the page
 * cannot hold one more key. */
__hot int __must_check_result node_add_dupfix(MDBX_cursor *mc, size_t indx,
                                              const MDBX_val *key) {
  page_t *mp = mc->pg[mc->top];
  MDBX_ANALYSIS_ASSUME(key != nullptr);
  DKBUF_DEBUG;
  DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, "
        " key size %" PRIuPTR " [%s]",
        is_subpage(mp) ? "sub-" : "", mp->pgno, indx, key ? key->iov_len : 0,
        DKEY_DEBUG(key));
  cASSERT(mc, key);
  cASSERT(mc, page_type_compat(mp) == (P_LEAF | P_DUPFIX));
  const size_t ksize = mc->tree->dupfix_size;
  cASSERT(mc, ksize == key->iov_len);
  const size_t nkeys = page_numkeys(mp);
  /* parity invariant between key count/size and the upper boundary */
  cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0);

  /* Just using these for counting */
  const intptr_t lower = mp->lower + sizeof(indx_t);
  const intptr_t upper = mp->upper - (ksize - sizeof(indx_t));
  if (unlikely(lower > upper)) {
    mc->txn->flags |= MDBX_TXN_ERROR;
    return MDBX_PAGE_FULL;
  }
  mp->lower = (indx_t)lower;
  mp->upper = (indx_t)upper;

  void *const ptr = page_dupfix_ptr(mp, indx, ksize);
  cASSERT(mc, nkeys >= indx);
  const size_t diff = nkeys - indx;
  if (likely(diff > 0))
    /* Move higher keys up one slot. */
    memmove(ptr_disp(ptr, ksize), ptr, diff * ksize);
  /* insert new key */
  memcpy(ptr, key->iov_base, ksize);
  cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0);
  return MDBX_SUCCESS;
}
/* Inserts a branch node (key + child page number) at position `indx` of a
 * branch page. A null `key` produces a zero-length key (the leftmost
 * branch entry). Returns MDBX_PAGE_FULL (marking the txn in error) when
 * the page has no room. */
int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx,
                                        const MDBX_val *key, pgno_t pgno) {
  page_t *mp = mc->pg[mc->top];
  DKBUF_DEBUG;
  DEBUG("add to branch-%spage %" PRIaPGNO " index %zi, node-pgno %" PRIaPGNO
        " key size %" PRIuPTR " [%s]",
        is_subpage(mp) ? "sub-" : "", mp->pgno, indx, pgno,
        key ? key->iov_len : 0, DKEY_DEBUG(key));
  cASSERT(mc, page_type(mp) == P_BRANCH);
  STATIC_ASSERT(NODESIZE % 2 == 0);

  /* Move higher pointers up one slot. */
  const size_t nkeys = page_numkeys(mp);
  cASSERT(mc, nkeys >= indx);
  for (size_t i = nkeys; i > indx; --i)
    mp->entries[i] = mp->entries[i - 1];

  /* Adjust free space offsets. */
  const size_t branch_bytes = branch_size(mc->txn->env, key);
  const intptr_t lower = mp->lower + sizeof(indx_t);
  const intptr_t upper = mp->upper - (branch_bytes - sizeof(indx_t));
  if (unlikely(lower > upper)) {
    mc->txn->flags |= MDBX_TXN_ERROR;
    return MDBX_PAGE_FULL;
  }
  mp->lower = (indx_t)lower;
  mp->entries[indx] = mp->upper = (indx_t)upper;

  /* Write the node data. */
  node_t *node = page_node(mp, indx);
  node_set_pgno(node, pgno);
  node_set_flags(node, 0);
  UNALIGNED_POKE_8(node, node_t, extra, 0);
  node_set_ks(node, 0);
  if (likely(key != nullptr)) {
    node_set_ks(node, key->iov_len);
    memcpy(node_key(node), key->iov_base, key->iov_len);
  }
  return MDBX_SUCCESS;
}
/* Inserts a leaf node (key + data) at position `indx` of a leaf page.
 * Depending on `flags` and the data size, the data is stored inline,
 * referenced as an already-allocated large/overflow page (N_BIGDATA set
 * by the caller), or spilled to a freshly allocated large page. With
 * MDBX_RESERVE the data is not copied; data->iov_base is pointed at the
 * reserved space instead. */
__hot int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx,
                                            const MDBX_val *key, MDBX_val *data,
                                            unsigned flags) {
  MDBX_ANALYSIS_ASSUME(key != nullptr);
  MDBX_ANALYSIS_ASSUME(data != nullptr);
  page_t *mp = mc->pg[mc->top];
  DKBUF_DEBUG;
  DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR
        " key size %" PRIuPTR " [%s]",
        is_subpage(mp) ? "sub-" : "", mp->pgno, indx, data ? data->iov_len : 0,
        key ? key->iov_len : 0, DKEY_DEBUG(key));
  cASSERT(mc, key != nullptr && data != nullptr);
  cASSERT(mc, page_type_compat(mp) == P_LEAF);
  page_t *largepage = nullptr;

  /* decide the on-page node size and whether a large page is involved */
  size_t node_bytes;
  if (unlikely(flags & N_BIGDATA)) {
    /* Data already on large/overflow page. */
    STATIC_ASSERT(sizeof(pgno_t) % 2 == 0);
    node_bytes =
        node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
    cASSERT(mc, page_room(mp) >= node_bytes);
  } else if (unlikely(node_size(key, data) > mc->txn->env->leaf_nodemax)) {
    /* Put data on large/overflow page. */
    if (unlikely(mc->tree->flags & MDBX_DUPSORT)) {
      ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db",
            mc->tree->flags);
      return MDBX_PROBLEM;
    }
    if (unlikely(flags & (N_DUPDATA | N_SUBDATA))) {
      ERROR("Unexpected target %s flags 0x%x for large data-item", "node",
            flags);
      return MDBX_PROBLEM;
    }
    cASSERT(mc, page_room(mp) >= leaf_size(mc->txn->env, key, data));
    const pgno_t ovpages = largechunk_npages(mc->txn->env, data->iov_len);
    const pgr_t npr = page_new_large(mc, ovpages);
    if (unlikely(npr.err != MDBX_SUCCESS))
      return npr.err;
    largepage = npr.page;
    DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR
          " data bytes",
          largepage->pages, largepage->pgno, data->iov_len);
    flags |= N_BIGDATA;
    node_bytes =
        node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
    cASSERT(mc, node_bytes == leaf_size(mc->txn->env, key, data));
  } else {
    cASSERT(mc, page_room(mp) >= leaf_size(mc->txn->env, key, data));
    node_bytes = node_size(key, data) + sizeof(indx_t);
    cASSERT(mc, node_bytes == leaf_size(mc->txn->env, key, data));
  }

  /* Move higher pointers up one slot. */
  const size_t nkeys = page_numkeys(mp);
  cASSERT(mc, nkeys >= indx);
  for (size_t i = nkeys; i > indx; --i)
    mp->entries[i] = mp->entries[i - 1];

  /* Adjust free space offsets. */
  const intptr_t lower = mp->lower + sizeof(indx_t);
  const intptr_t upper = mp->upper - (node_bytes - sizeof(indx_t));
  if (unlikely(lower > upper)) {
    mc->txn->flags |= MDBX_TXN_ERROR;
    return MDBX_PAGE_FULL;
  }
  mp->lower = (indx_t)lower;
  mp->entries[indx] = mp->upper = (indx_t)upper;

  /* Write the node data. */
  node_t *node = page_node(mp, indx);
  node_set_ks(node, key->iov_len);
  node_set_flags(node, (uint8_t)flags);
  UNALIGNED_POKE_8(node, node_t, extra, 0);
  node_set_ds(node, data->iov_len);
  memcpy(node_key(node), key->iov_base, key->iov_len);

  void *nodedata = node_data(node);
  if (likely(largepage == nullptr)) {
    if (unlikely(flags & N_BIGDATA)) {
      /* caller-provided large page: store just its page number */
      memcpy(nodedata, data->iov_base, sizeof(pgno_t));
      return MDBX_SUCCESS;
    }
  } else {
    /* freshly allocated large page: store its number, then copy the
     * payload into the large page itself */
    poke_pgno(nodedata, largepage->pgno);
    nodedata = page_data(largepage);
  }
  if (unlikely(flags & MDBX_RESERVE))
    data->iov_base = nodedata;
  else if (likely(data->iov_len /* to avoid UBSAN traps */))
    memcpy(nodedata, data->iov_base, data->iov_len);
  return MDBX_SUCCESS;
}
/* Deletes the node under the cursor (mc->ki[mc->top]) from the current
 * page, compacting the entry array and the node heap. For DUPFIX pages
 * `ksize` is the fixed key size; otherwise it is ignored. */
__hot void node_del(MDBX_cursor *mc, size_t ksize) {
  page_t *mp = mc->pg[mc->top];
  const size_t hole = mc->ki[mc->top];
  const size_t nkeys = page_numkeys(mp);
  DEBUG("delete node %zu on %s page %" PRIaPGNO, hole,
        is_leaf(mp) ? "leaf" : "branch", mp->pgno);
  cASSERT(mc, hole < nkeys);

  if (is_dupfix_leaf(mp)) {
    /* DUPFIX page: keys are a packed array, just close the gap */
    cASSERT(mc, ksize >= sizeof(indx_t));
    size_t diff = nkeys - 1 - hole;
    void *const base = page_dupfix_ptr(mp, hole, ksize);
    if (diff)
      memmove(base, ptr_disp(base, ksize), diff * ksize);
    cASSERT(mc, mp->lower >= sizeof(indx_t));
    mp->lower -= sizeof(indx_t);
    cASSERT(mc, (size_t)UINT16_MAX - mp->upper >= ksize - sizeof(indx_t));
    mp->upper += (indx_t)(ksize - sizeof(indx_t));
    cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0);
    return;
  }

  node_t *node = page_node(mp, hole);
  cASSERT(mc, !is_branch(mp) || hole || node_ks(node) == 0);
  /* size of the node being removed (key + inline data or large-page ref) */
  size_t hole_size = NODESIZE + node_ks(node);
  if (is_leaf(mp))
    hole_size +=
        (node_flags(node) & N_BIGDATA) ? sizeof(pgno_t) : node_ds(node);
  hole_size = EVEN_CEIL(hole_size);

  /* drop the hole's entry and shift offsets of nodes that lived below it */
  const indx_t hole_offset = mp->entries[hole];
  size_t r, w;
  for (r = w = 0; r < nkeys; r++)
    if (r != hole)
      mp->entries[w++] = (mp->entries[r] < hole_offset)
                             ? mp->entries[r] + (indx_t)hole_size
                             : mp->entries[r];
  /* slide the node heap up over the removed node */
  void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ);
  memmove(ptr_disp(base, hole_size), base, hole_offset - mp->upper);
  cASSERT(mc, mp->lower >= sizeof(indx_t));
  mp->lower -= sizeof(indx_t);
  cASSERT(mc, (size_t)UINT16_MAX - mp->upper >= hole_size);
  mp->upper += (indx_t)hole_size;

  if (AUDIT_ENABLED()) {
    /* verify page integrity in updating mode, then restore the flags */
    const uint8_t checking = mc->checking;
    mc->checking |= z_updating;
    const int page_check_err = page_check(mc, mp);
    mc->checking = checking;
    cASSERT(mc, page_check_err == MDBX_SUCCESS);
  }
}
/* Resolve the out-of-page value referenced by a N_BIGDATA node: fetch the
 * large/overflow page and point data->iov_base at its payload.
 * Returns MDBX_SUCCESS, a page_get_large() error, or a bad-page error when
 * the overflow chunk is shorter than the declared value size. */
__noinline int node_read_bigdata(MDBX_cursor *mc, const node_t *node,
                                 MDBX_val *data, const page_t *mp) {
  cASSERT(mc, node_flags(node) == N_BIGDATA && data->iov_len == node_ds(node));

  const pgno_t large_pgno = node_largedata_pgno(node);
  const pgr_t got = page_get_large(mc, large_pgno, mp->txnid);
  if (unlikely(got.err != MDBX_SUCCESS)) {
    DEBUG("read large/overflow page %" PRIaPGNO " failed", large_pgno);
    return got.err;
  }

  cASSERT(mc, page_type(got.page) == P_LARGE);
  data->iov_base = page_data(got.page);
  if (!MDBX_DISABLE_VALIDATION) {
    /* Cross-check that the overflow chunk spans enough pages to hold the
     * value size recorded in the node. */
    const unsigned needed_npages =
        largechunk_npages(mc->txn->env, data->iov_len);
    if (unlikely(got.page->pages < needed_npages))
      return bad_page(got.page,
                      "too less n-pages %u for bigdata-node (%zu bytes)",
                      got.page->pages, data->iov_len);
  }
  return MDBX_SUCCESS;
}
/* Shrink a leaf node containing an embedded sub-page by reclaiming the
 * sub-page's free room: the kept part of the sub-page (and all nodes stored
 * above it) is shifted upward within the host page, and the node's data-size
 * is reduced accordingly.
 *
 * mp   - the host page.
 * indx - index of the node within mp.
 * node - the node itself; must equal page_node(mp, indx).
 *
 * Returns the node pointer after the shift (moved up by delta). */
node_t *node_shrink(page_t *mp, size_t indx, node_t *node) {
  assert(node == page_node(mp, indx));
  page_t *sp = (page_t *)node_data(node);
  assert(is_subpage(sp) && page_numkeys(sp) > 0);
  /* EVEN_FLOOR keeps the node even-sized after shrinking. */
  const size_t delta =
      EVEN_FLOOR(page_room(sp) /* avoid the node uneven-sized */);
  /* FIX: the branch-prediction hint was misplaced as `unlikely(delta) == 0`;
   * since unlikely() yields a boolean the behavior was identical, but the
   * hint annotated the wrong expression. */
  if (unlikely(delta == 0))
    return node;

  /* Prepare to shift upward, set len = length(subpage part to shift) */
  size_t nsize = node_ds(node) - delta, len = nsize;
  /* NOTE(review): `% 1 == 0` is a tautology; presumably an evenness check
   * (`% 2 == 0`) was intended — confirm before strengthening the assert. */
  assert(nsize % 1 == 0);
  if (!is_dupfix_leaf(sp)) {
    /* Non-DUPFIX sub-page: only its header moves with the memmove below
     * (len = PAGEHDRSZ); rebase the sub-page entry offsets by delta into
     * the destination header location. */
    len = PAGEHDRSZ;
    page_t *xp = ptr_disp(sp, delta); /* destination subpage */
    for (intptr_t i = page_numkeys(sp); --i >= 0;) {
      assert(sp->entries[i] >= delta);
      xp->entries[i] = (indx_t)(sp->entries[i] - delta);
    }
  }
  assert(sp->upper >= sp->lower + delta);
  sp->upper -= (indx_t)delta;
  sp->pgno = mp->pgno;
  node_set_ds(node, nsize);

  /* Shift <lower nodes...initial part of subpage> upward */
  void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ);
  memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len);

  /* Rebase offsets of all entries located at-or-below the moved region. */
  const size_t pivot = mp->entries[indx];
  for (intptr_t i = page_numkeys(mp); --i >= 0;) {
    if (mp->entries[i] <= pivot) {
      assert((size_t)UINT16_MAX - mp->entries[i] >= delta);
      mp->entries[i] += (indx_t)delta;
    }
  }
  assert((size_t)UINT16_MAX - mp->upper >= delta);
  mp->upper += (indx_t)delta;

  return ptr_disp(node, delta);
}
/* Binary-search the current page (mc->pg[mc->top]) for 'key'.
 *
 * Sets mc->ki[mc->top] to the index of the smallest entry >= key (== nkeys
 * when every entry is smaller).  In the result:
 *   ret.exact - true when an exactly-equal key was found;
 *   ret.node  - the matching node; nullptr when no entry >= key exists;
 *               a fake non-null marker for DUPFIX pages, which store keys
 *               as a dense array without node structures. */
__hot struct node_search_result node_search(MDBX_cursor *mc,
                                            const MDBX_val *key) {
  page_t *mp = mc->pg[mc->top];
  const intptr_t nkeys = page_numkeys(mp);
  DKBUF_DEBUG;

  DEBUG("searching %zu keys in %s %spage %" PRIaPGNO, nkeys,
        is_leaf(mp) ? "leaf" : "branch", is_subpage(mp) ? "sub-" : "",
        mp->pgno);

  struct node_search_result ret;
  ret.exact = false;
  /* On a branch page the search starts at index 1, skipping the implicit
   * leftmost child entry (hence low = flags & P_BRANCH, P_BRANCH == 1). */
  STATIC_ASSERT(P_BRANCH == 1);
  intptr_t low = mp->flags & P_BRANCH;
  intptr_t high = nkeys - 1;
  if (unlikely(high < low)) {
    /* No searchable entries on this page. */
    mc->ki[mc->top] = 0;
    ret.node = nullptr;
    return ret;
  }

  intptr_t i;
  MDBX_cmp_func *cmp = mc->clc->k.cmp;
  MDBX_val nodekey;
  if (unlikely(is_dupfix_leaf(mp))) {
    /* DUPFIX page: fixed-size keys addressed by index, no node headers. */
    cASSERT(mc, mp->dupfix_ksize == mc->tree->dupfix_size);
    nodekey.iov_len = mp->dupfix_ksize;
    do {
      i = (low + high) >> 1;
      nodekey.iov_base = page_dupfix_ptr(mp, i, nodekey.iov_len);
      /* key must not extend past the end of the page */
      cASSERT(mc, ptr_disp(mp, mc->txn->env->ps) >=
                      ptr_disp(nodekey.iov_base, nodekey.iov_len));
      int cr = cmp(key, &nodekey);
      DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr);
      if (cr > 0)
        low = ++i;
      else if (cr < 0)
        high = i - 1;
      else {
        ret.exact = true;
        break;
      }
    } while (likely(low <= high));

    /* store the key index */
    mc->ki[mc->top] = (indx_t)i;
    ret.node =
        (i < nkeys)
            ? /* fake for DUPFIX */ (node_t *)(intptr_t)-1
            : /* There is no entry larger or equal to the key. */ nullptr;
    return ret;
  }

  if (MDBX_UNALIGNED_OK < 4 && is_branch(mp) && cmp == cmp_int_align2)
    /* Branch pages have no data, so if using integer keys,
     * alignment is guaranteed. Use faster cmp_int_align4(). */
    cmp = cmp_int_align4;

  node_t *node;
  do {
    i = (low + high) >> 1;
    node = page_node(mp, i);
    nodekey.iov_len = node_ks(node);
    nodekey.iov_base = node_key(node);
    /* key must not extend past the end of the page */
    cASSERT(mc, ptr_disp(mp, mc->txn->env->ps) >=
                    ptr_disp(nodekey.iov_base, nodekey.iov_len));
    int cr = cmp(key, &nodekey);
    if (is_leaf(mp))
      DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr);
    else
      DEBUG("found branch index %zu [%s -> %" PRIaPGNO "], rc = %i", i,
            DKEY_DEBUG(&nodekey), node_pgno(node), cr);
    if (cr > 0)
      low = ++i;
    else if (cr < 0)
      high = i - 1;
    else {
      ret.exact = true;
      break;
    }
  } while (likely(low <= high));

  /* store the key index */
  mc->ki[mc->top] = (indx_t)i;
  ret.node = (i < nkeys)
                 ? page_node(mp, i)
                 : /* There is no entry larger or equal to the key. */ nullptr;
  return ret;
}

125
src/node.h Normal file
View File

@ -0,0 +1,125 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* valid flags for mdbx_node_add() */
#define NODE_ADD_FLAGS (N_DUPDATA | N_SUBDATA | MDBX_RESERVE | MDBX_APPEND)
/* Get the page number pointed to by a branch node
 * (unaligned-safe read of the 32-bit child_pgno field). */
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
node_pgno(const node_t *const __restrict node) {
  pgno_t pgno = UNALIGNED_PEEK_32(node, node_t, child_pgno);
  return pgno;
}
/* Set the page number in a branch node;
 * pgno must lie within [MIN_PAGENO, MAX_PAGENO]. */
static inline void node_set_pgno(node_t *const __restrict node, pgno_t pgno) {
  assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO);
  UNALIGNED_POKE_32(node, node_t, child_pgno, (uint32_t)pgno);
}
/* Get the size of the data in a leaf node (32-bit field; for a N_BIGDATA
 * node this is the size of the out-of-page value, not of the stored pgno). */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
node_ds(const node_t *const __restrict node) {
  return UNALIGNED_PEEK_32(node, node_t, dsize);
}
/* Set the size of the data for a leaf node;
 * size must fit the 32-bit dsize field (asserted < INT_MAX). */
static inline void node_set_ds(node_t *const __restrict node, size_t size) {
  assert(size < INT_MAX);
  UNALIGNED_POKE_32(node, node_t, dsize, (uint32_t)size);
}
/* The size of a key in a node (16-bit ksize field). */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
node_ks(const node_t *const __restrict node) {
  return UNALIGNED_PEEK_16(node, node_t, ksize);
}
/* Set the size of the key for a leaf node;
 * size must fit the 16-bit ksize field (asserted < INT16_MAX). */
static inline void node_set_ks(node_t *const __restrict node, size_t size) {
  assert(size < INT16_MAX);
  UNALIGNED_POKE_16(node, node_t, ksize, (uint16_t)size);
}
/* Get the node's flag bits (e.g. N_BIGDATA, N_DUPDATA, N_SUBDATA). */
MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
node_flags(const node_t *const __restrict node) {
  return UNALIGNED_PEEK_8(node, node_t, flags);
}
/* Overwrite the node's flag byte. */
static inline void node_set_flags(node_t *const __restrict node,
                                  uint8_t flags) {
  UNALIGNED_POKE_8(node, node_t, flags, flags);
}
/* Address of the key for the node (immediately after the node header). */
MDBX_NOTHROW_PURE_FUNCTION static inline void *
node_key(const node_t *const __restrict node) {
  return ptr_disp(node, NODESIZE);
}
/* Address of the data for a node (immediately after the key bytes). */
MDBX_NOTHROW_PURE_FUNCTION static inline void *
node_data(const node_t *const __restrict node) {
  return ptr_disp(node_key(node), node_ks(node));
}
/* Size of a node in a leaf page with a given key and data.
 * This is node header plus key plus data size,
 * with the payload rounded up to an even length (EVEN_CEIL). */
MDBX_NOTHROW_CONST_FUNCTION static inline size_t
node_size_len(const size_t key_len, const size_t value_len) {
  return NODESIZE + EVEN_CEIL(key_len + value_len);
}
/* Node size for the given optional key and value;
 * a nullptr argument counts as zero length. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t
node_size(const MDBX_val *key, const MDBX_val *value) {
  return node_size_len(key ? key->iov_len : 0, value ? value->iov_len : 0);
}
/* Page number of the large/overflow page referenced by a N_BIGDATA node:
 * the node's data area holds the pgno instead of the value itself. */
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
node_largedata_pgno(const node_t *const __restrict node) {
  assert(node_flags(node) & N_BIGDATA);
  return peek_pgno(node_data(node));
}
MDBX_INTERNAL int __must_check_result node_read_bigdata(MDBX_cursor *mc,
const node_t *node,
MDBX_val *data,
const page_t *mp);
/* Read a node's value into *data.  For an ordinary node *data points at the
 * in-page bytes; for a N_BIGDATA node the large/overflow page is resolved
 * via node_read_bigdata().  Returns MDBX_SUCCESS or an error code. */
static inline int __must_check_result node_read(MDBX_cursor *mc,
                                                const node_t *node,
                                                MDBX_val *data,
                                                const page_t *mp) {
  data->iov_len = node_ds(node);
  data->iov_base = node_data(node);
  if (likely(node_flags(node) != N_BIGDATA))
    return MDBX_SUCCESS;
  return node_read_bigdata(mc, node, data, mp);
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL nsr_t node_search(MDBX_cursor *mc, const MDBX_val *key);
MDBX_INTERNAL int __must_check_result node_add_branch(MDBX_cursor *mc,
size_t indx,
const MDBX_val *key,
pgno_t pgno);
MDBX_INTERNAL int __must_check_result node_add_leaf(MDBX_cursor *mc,
size_t indx,
const MDBX_val *key,
MDBX_val *data,
unsigned flags);
MDBX_INTERNAL int __must_check_result node_add_dupfix(MDBX_cursor *mc,
size_t indx,
const MDBX_val *key);
MDBX_INTERNAL void node_del(MDBX_cursor *mc, size_t ksize);
MDBX_INTERNAL node_t *node_shrink(page_t *mp, size_t indx, node_t *node);

View File

@ -1,7 +1,10 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
/*******************************************************************************
*******************************************************************************
*******************************************************************************
*
* BUILD TIME
*
* #### ##### ##### # #### # # ####
* # # # # # # # # ## # #
@ -13,6 +16,10 @@
*
*/
#pragma once
#include "essentials.h"
/** \defgroup build_option Build options
* The libmdbx build options.
@{ */
@ -192,7 +199,11 @@
/** Avoid dependence from MSVC CRT and use ntdll.dll instead. */
#ifndef MDBX_WITHOUT_MSVC_CRT
#if !defined(MDBX_BUILD_CXX) || !MDBX_BUILD_CXX
#define MDBX_WITHOUT_MSVC_CRT 1
#else
#define MDBX_WITHOUT_MSVC_CRT 0
#endif
#elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1)
#error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1
#endif /* MDBX_WITHOUT_MSVC_CRT */
@ -499,6 +510,13 @@
#endif
#endif /* MDBX_CACHELINE_SIZE */
/* Max length of iov-vector passed to writev() call, used for auxilary writes */
#define MDBX_AUXILARY_IOV_MAX 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX
#undef MDBX_AUXILARY_IOV_MAX
#define MDBX_AUXILARY_IOV_MAX IOV_MAX
#endif /* MDBX_AUXILARY_IOV_MAX */
/** @} end of build options */
/*******************************************************************************
*******************************************************************************
@ -513,6 +531,9 @@
#else
#define MDBX_DEBUG 1
#endif
#endif
#if MDBX_DEBUG < 0 || MDBX_DEBUG > 2
#error "The MDBX_DEBUG must be defined to 0, 1 or 2"
#endif /* MDBX_DEBUG */
#else
@ -532,7 +553,7 @@
* Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`.
*
* \ingroup build_option */
#define MDBX_DEBUG 0...7
#define MDBX_DEBUG 0...2
/** Disables using of GNU libc extensions. */
#define MDBX_DISABLE_GNU_SOURCE 0 or 1

File diff suppressed because it is too large Load Diff

View File

@ -1,50 +1,11 @@
/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// https://en.wikipedia.org/wiki/Operating_system_abstraction_layer
#pragma once
/*----------------------------------------------------------------------------*/
/* C11 Atomics */
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
#include <cstdatomic>
#define MDBX_HAVE_C11ATOMICS
#elif !defined(__cplusplus) && \
(__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
!defined(__STDC_NO_ATOMICS__) && \
(__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \
!(defined(__GNUC__) || defined(__clang__)))
#include <stdatomic.h>
#define MDBX_HAVE_C11ATOMICS
#elif defined(__GNUC__) || defined(__clang__)
#elif defined(_MSC_VER)
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
#pragma warning(disable : 4133) /* 'function': incompatible types - from \
'size_t' to 'LONGLONG' */
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \
'std::size_t', possible loss of data */
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \
'long', possible loss of data */
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
#elif defined(__APPLE__)
#include <libkern/OSAtomic.h>
#else
#error FIXME atomic-ops
#endif
#include "essentials.h"
/*----------------------------------------------------------------------------*/
/* Memory/Compiler barriers, cache coherence */
@ -58,7 +19,7 @@
#include <sys/cachectl.h>
#endif
MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) {
MDBX_MAYBE_UNUSED static inline void osal_compiler_barrier(void) {
#if defined(__clang__) || defined(__GNUC__)
__asm__ __volatile__("" ::: "memory");
#elif defined(_MSC_VER)
@ -78,7 +39,7 @@ MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) {
#endif
}
MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) {
MDBX_MAYBE_UNUSED static inline void osal_memory_barrier(void) {
#ifdef MDBX_HAVE_C11ATOMICS
atomic_thread_fence(memory_order_seq_cst);
#elif defined(__ATOMIC_SEQ_CST)
@ -118,7 +79,7 @@ MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) {
#define HAVE_SYS_TYPES_H
typedef HANDLE osal_thread_t;
typedef unsigned osal_thread_key_t;
#define MAP_FAILED NULL
#define MAP_FAILED nullptr
#define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0))
#define THREAD_CALL WINAPI
#define THREAD_RESULT DWORD
@ -210,19 +171,6 @@ typedef pthread_mutex_t osal_fastmutex_t;
/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2,
sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
return sys_pagesize;
}
#if defined(_WIN32) || defined(_WIN64)
typedef wchar_t pathchar_t;
#define MDBX_PRIsPATH "ls"
@ -234,7 +182,7 @@ typedef char pathchar_t;
typedef struct osal_mmap {
union {
void *base;
struct MDBX_lockinfo *lck;
struct shared_lck *lck;
};
mdbx_filehandle_t fd;
size_t limit; /* mapping length, but NOT a size of file nor DB */
@ -245,25 +193,6 @@ typedef struct osal_mmap {
#endif
} osal_mmap_t;
typedef union bin128 {
__anonymous_struct_extension__ struct {
uint64_t x, y;
};
__anonymous_struct_extension__ struct {
uint32_t a, b, c, d;
};
} bin128_t;
#if defined(_WIN32) || defined(_WIN64)
typedef union osal_srwlock {
__anonymous_struct_extension__ struct {
long volatile readerCount;
long volatile writerCount;
};
RTL_SRWLOCK native;
} osal_srwlock_t;
#endif /* Windows */
#ifndef MDBX_HAVE_PWRITEV
#if defined(_WIN32) || defined(_WIN64)
@ -346,32 +275,30 @@ typedef struct osal_ioring {
char *boundary;
} osal_ioring_t;
#ifndef __cplusplus
/* Actually this is not ioring for now, but on the way. */
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *
MDBX_INTERNAL int osal_ioring_create(osal_ioring_t *
#if defined(_WIN32) || defined(_WIN64)
,
bool enable_direct,
mdbx_filehandle_t overlapped_fd
,
bool enable_direct,
mdbx_filehandle_t overlapped_fd
#endif /* Windows */
);
MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
MDBX_INTERNAL int osal_ioring_resize(osal_ioring_t *, size_t items);
MDBX_INTERNAL void osal_ioring_destroy(osal_ioring_t *);
MDBX_INTERNAL void osal_ioring_reset(osal_ioring_t *);
MDBX_INTERNAL int osal_ioring_add(osal_ioring_t *ctx, const size_t offset,
void *data, const size_t bytes);
typedef struct osal_ioring_write_result {
int err;
unsigned wops;
} osal_ioring_write_result_t;
MDBX_INTERNAL_FUNC osal_ioring_write_result_t
MDBX_INTERNAL osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd);
typedef struct iov_ctx iov_ctx_t;
MDBX_INTERNAL_FUNC void osal_ioring_walk(
osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes));
MDBX_INTERNAL void osal_ioring_walk(osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx,
size_t offset, void *data,
size_t bytes));
MDBX_MAYBE_UNUSED static inline unsigned
osal_ioring_left(const osal_ioring_t *ior) {
@ -408,9 +335,9 @@ osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) {
#define osal_asprintf asprintf
#define osal_vasprintf vasprintf
#else
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC
MDBX_MAYBE_UNUSED MDBX_INTERNAL
MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...);
MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap);
MDBX_INTERNAL int osal_vasprintf(char **strp, const char *fmt, va_list ap);
#endif
#if !defined(MADV_DODUMP) && defined(MADV_CORE)
@ -421,8 +348,7 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap);
#define MADV_DONTDUMP MADV_NOCORE
#endif /* MADV_NOCORE -> MADV_DONTDUMP */
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void osal_jitter(bool tiny);
/* max bytes to write in one call */
#if defined(_WIN64)
@ -472,19 +398,13 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);
#endif /* OFF_T_MAX */
#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */
#endif
#if defined(__linux__) || defined(__gnu_linux__)
MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version;
MDBX_INTERNAL_VAR_PROTO bool
mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
#endif /* Linux */
#endif /* !Windows */
#ifndef osal_strdup
LIBMDBX_API char *osal_strdup(const char *str);
#endif
MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) {
MDBX_MAYBE_UNUSED static inline int osal_get_errno(void) {
#if defined(_WIN32) || defined(_WIN64)
DWORD rc = GetLastError();
#else
@ -494,40 +414,39 @@ MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) {
}
#ifndef osal_memalign_alloc
MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes,
void **result);
MDBX_INTERNAL int osal_memalign_alloc(size_t alignment, size_t bytes,
void **result);
#endif
#ifndef osal_memalign_free
MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr);
MDBX_INTERNAL void osal_memalign_free(void *ptr);
#endif
MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair,
bool part);
MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part);
MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair);
MDBX_INTERNAL int osal_condpair_init(osal_condpair_t *condpair);
MDBX_INTERNAL int osal_condpair_lock(osal_condpair_t *condpair);
MDBX_INTERNAL int osal_condpair_unlock(osal_condpair_t *condpair);
MDBX_INTERNAL int osal_condpair_signal(osal_condpair_t *condpair, bool part);
MDBX_INTERNAL int osal_condpair_wait(osal_condpair_t *condpair, bool part);
MDBX_INTERNAL int osal_condpair_destroy(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL int osal_fastmutex_init(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
size_t count, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
size_t count);
MDBX_INTERNAL int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
size_t sgvcnt, uint64_t offset);
MDBX_INTERNAL int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
uint64_t offset);
MDBX_INTERNAL int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
size_t count, uint64_t offset);
MDBX_INTERNAL int osal_write(mdbx_filehandle_t fd, const void *buf,
size_t count);
MDBX_INTERNAL_FUNC int
MDBX_INTERNAL int
osal_thread_create(osal_thread_t *thread,
THREAD_RESULT(THREAD_CALL *start_routine)(void *),
void *arg);
MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread);
MDBX_INTERNAL int osal_thread_join(osal_thread_t thread);
enum osal_syncmode_bits {
MDBX_SYNC_NONE = 0,
@ -537,11 +456,11 @@ enum osal_syncmode_bits {
MDBX_SYNC_IODQ = 8
};
MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd,
const enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd,
const enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
enum osal_openfile_purpose {
MDBX_OPEN_DXB_READ,
@ -556,7 +475,7 @@ enum osal_openfile_purpose {
MDBX_OPEN_DELETE
};
MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) {
MDBX_MAYBE_UNUSED static inline bool osal_isdirsep(pathchar_t c) {
return
#if defined(_WIN32) || defined(_WIN64)
c == '\\' ||
@ -564,50 +483,45 @@ MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) {
c == '/';
}
MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r,
size_t len);
MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname,
size_t len);
MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname);
MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
const MDBX_env *env,
const pathchar_t *pathname,
mdbx_filehandle_t *fd,
mdbx_mode_t unix_mode_bits);
MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname);
MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname);
MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait);
MDBX_INTERNAL bool osal_pathequal(const pathchar_t *l, const pathchar_t *r,
size_t len);
MDBX_INTERNAL pathchar_t *osal_fileext(const pathchar_t *pathname, size_t len);
MDBX_INTERNAL int osal_fileexists(const pathchar_t *pathname);
MDBX_INTERNAL int osal_openfile(const enum osal_openfile_purpose purpose,
const MDBX_env *env, const pathchar_t *pathname,
mdbx_filehandle_t *fd,
mdbx_mode_t unix_mode_bits);
MDBX_INTERNAL int osal_closefile(mdbx_filehandle_t fd);
MDBX_INTERNAL int osal_removefile(const pathchar_t *pathname);
MDBX_INTERNAL int osal_removedirectory(const pathchar_t *pathname);
MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd);
MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait);
#define MMAP_OPTION_TRUNCATE 1
#define MMAP_OPTION_SEMAPHORE 2
MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size,
const size_t limit, const unsigned options);
MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map);
MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size,
const size_t limit, const unsigned options);
MDBX_INTERNAL int osal_munmap(osal_mmap_t *map);
#define MDBX_MRESIZE_MAY_MOVE 0x00000100
#define MDBX_MRESIZE_MAY_UNMAP 0x00000200
MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map,
size_t size, size_t limit);
MDBX_INTERNAL int osal_mresize(const int flags, osal_mmap_t *map, size_t size,
size_t limit);
#if defined(_WIN32) || defined(_WIN64)
typedef struct {
unsigned limit, count;
HANDLE handles[31];
} mdbx_handle_array_t;
MDBX_INTERNAL_FUNC int
MDBX_INTERNAL int
osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
MDBX_INTERNAL int osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length,
enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
const pathchar_t *pathname,
int err);
MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle);
MDBX_INTERNAL int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length, enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL int osal_check_fs_rdonly(mdbx_filehandle_t handle,
const pathchar_t *pathname, int err);
MDBX_INTERNAL int osal_check_fs_incore(mdbx_filehandle_t handle);
MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) {
MDBX_MAYBE_UNUSED static inline uint32_t osal_getpid(void) {
STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t));
#if defined(_WIN32) || defined(_WIN64)
return GetCurrentProcessId();
@ -617,7 +531,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) {
#endif
}
MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) {
MDBX_MAYBE_UNUSED static inline uintptr_t osal_thread_self(void) {
mdbx_tid_t thunk;
STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk));
#if defined(_WIN32) || defined(_WIN64)
@ -630,22 +544,22 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) {
#if !defined(_WIN32) && !defined(_WIN64)
#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC)
MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void);
MDBX_INTERNAL int osal_check_tid4bionic(void);
#else
static __inline int osal_check_tid4bionic(void) { return 0; }
static inline int osal_check_tid4bionic(void) { return 0; }
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */
MDBX_MAYBE_UNUSED static __inline int
MDBX_MAYBE_UNUSED static inline int
osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
int err = osal_check_tid4bionic();
return unlikely(err) ? err : pthread_mutex_lock(mutex);
}
#endif /* !Windows */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_INTERNAL uint64_t osal_monotime(void);
MDBX_INTERNAL uint64_t osal_cputime(size_t *optional_page_faults);
MDBX_INTERNAL uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL uint32_t osal_monotime_to_16dot16(uint64_t monotime);
MDBX_MAYBE_UNUSED static inline uint32_t
osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
@ -653,249 +567,18 @@ osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) {
return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0);
}
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
/*----------------------------------------------------------------------------*/
/* lck stuff */
/// \brief Initialization of synchronization primitives linked with MDBX_env
/// instance both in LCK-file and within the current process.
/// \param
/// global_uniqueness_flag = true - denotes that there are no other processes
/// working with DB and LCK-file. Thus the function MUST initialize
/// shared synchronization objects in memory-mapped LCK-file.
/// global_uniqueness_flag = false - denotes that at least one process is
/// already working with DB and LCK-file, including the case when DB
/// has already been opened in the current process. Thus the function
/// MUST NOT initialize shared synchronization objects in memory-mapped
/// LCK-file that are already in use.
/// \return Error code or zero on success.
MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
MDBX_env *inprocess_neighbor,
int global_uniqueness_flag);
/// \brief Disconnects from shared interprocess objects and destructs
/// synchronization objects linked with MDBX_env instance
/// within the current process.
/// \param
/// inprocess_neighbor = NULL - if the current process does not have other
/// instances of MDBX_env linked with the DB being closed.
/// Thus the function MUST check for other processes working with DB or
/// LCK-file, and keep or destroy shared synchronization objects in
/// memory-mapped LCK-file depending on the result.
/// inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
/// (anyone of there is several) working with DB or LCK-file within the
/// current process. Thus the function MUST NOT try to acquire exclusive
/// lock and/or try to destruct shared synchronization objects linked with
/// DB or LCK-file. Moreover, the implementation MUST ensure correct work
/// of other instances of MDBX_env within the current process, e.g.
/// restore POSIX-fcntl locks after the closing of file descriptors.
/// \return Error code (MDBX_PANIC) or zero on success.
MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
MDBX_env *inprocess_neighbor,
const uint32_t current_pid);
/// \brief Connects to shared interprocess locking objects and tries to acquire
/// the maximum lock level (shared if exclusive is not available)
/// Depending on implementation or/and platform (Windows) this function may
/// acquire the non-OS super-level lock (e.g. for shared synchronization
/// objects initialization), which will be downgraded to OS-exclusive or
/// shared via explicit calling of osal_lck_downgrade().
/// \return
/// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
/// the current process is the first and only after the last use of DB.
/// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
/// DB has already been opened and now is used by other processes.
/// Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env);
/// \brief Downgrades the level of initially acquired lock to
/// operational level specified by argument. The reason for such downgrade:
/// - unblocking of other processes that are waiting for access, i.e.
/// if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
/// should be made aware that access is unavailable rather than
/// wait for it.
/// - freeing locks that interfere file operation (especially for Windows)
/// (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
/// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
/// operational lock.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env);
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env,
bool dont_wait);
/// \brief Locks LCK-file or/and table of readers for (de)registering.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env);
/// \brief Unlocks LCK-file or/and table of readers after (de)registering.
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env);
/// \brief Acquires write-transaction lock.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait);
/// \brief Releases write-transaction lock..
MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env);
/// \brief Sets alive-flag of reader presence (indicative lock) for PID of
/// the current process. The function does no more than needed for
/// the correct working of osal_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env);
/// \brief Resets alive-flag of reader presence (indicative lock)
/// for PID of the current process. The function does no more than needed
/// for the correct working of osal_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env);
/// \brief Checks for reading process status with the given pid with help of
/// alive-flag of presence (indicative lock) or using another way.
/// \return
/// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
/// and working with DB (indicative lock is present).
/// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
/// or not working with DB (indicative lock is not present).
/// Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
MDBX_INTERNAL void osal_ctor(void);
MDBX_INTERNAL void osal_dtor(void);
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst);
typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *);
MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init,
osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared,
osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive;
#if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */
typedef enum _FILE_INFO_BY_HANDLE_CLASS {
FileBasicInfo,
FileStandardInfo,
FileNameInfo,
FileRenameInfo,
FileDispositionInfo,
FileAllocationInfo,
FileEndOfFileInfo,
FileStreamInfo,
FileCompressionInfo,
FileAttributeTagInfo,
FileIdBothDirectoryInfo,
FileIdBothDirectoryRestartInfo,
FileIoPriorityHintInfo,
FileRemoteProtocolInfo,
MaximumFileInfoByHandleClass
} FILE_INFO_BY_HANDLE_CLASS,
*PFILE_INFO_BY_HANDLE_CLASS;
typedef struct _FILE_END_OF_FILE_INFO {
LARGE_INTEGER EndOfFile;
} FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO;
#define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001
#define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002
typedef struct _FILE_REMOTE_PROTOCOL_INFO {
USHORT StructureVersion;
USHORT StructureSize;
DWORD Protocol;
USHORT ProtocolMajorVersion;
USHORT ProtocolMinorVersion;
USHORT ProtocolRevision;
USHORT Reserved;
DWORD Flags;
struct {
DWORD Reserved[8];
} GenericReserved;
struct {
DWORD Reserved[16];
} ProtocolSpecificReserved;
} FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO;
#endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */
typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)(
_In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
_Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx
mdbx_GetFileInformationByHandleEx;
typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)(
_In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer,
_In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber,
_Out_opt_ LPDWORD lpMaximumComponentLength,
_Out_opt_ LPDWORD lpFileSystemFlags,
_Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize);
MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW
mdbx_GetVolumeInformationByHandleW;
typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile,
_Out_ LPWSTR lpszFilePath,
_In_ DWORD cchFilePath,
_In_ DWORD dwFlags);
MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW
mdbx_GetFinalPathNameByHandleW;
typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)(
_In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
_Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle
mdbx_SetFileInformationByHandle;
typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
IN HANDLE FileHandle, IN OUT HANDLE Event,
IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext,
OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode,
IN OUT PVOID InputBuffer, IN ULONG InputBufferLength,
OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength);
MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile;
typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void);
MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64;
#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8
typedef struct _WIN32_MEMORY_RANGE_ENTRY {
PVOID VirtualAddress;
SIZE_T NumberOfBytes;
} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY;
#endif /* Windows 8.x */
typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
HANDLE hProcess, ULONG_PTR NumberOfEntries,
PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT;
typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle,
IN PLARGE_INTEGER NewSectionSize);
MDBX_INTERNAL_VAR_PROTO MDBX_NtExtendSection mdbx_NtExtendSection;
/* Heuristic Wine detection: reports true when the NtExtendSection import was
 * not resolved at startup. NOTE(review): this presumes genuine Windows always
 * exports NtExtendSection while Wine does not — confirm against the import
 * code that fills mdbx_NtExtendSection. */
static __inline bool mdbx_RunningUnderWine(void) {
  return !mdbx_NtExtendSection;
}
typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey,
LPCSTR lpValue, DWORD dwFlags,
LPDWORD pdwType, PVOID pvData,
LPDWORD pcbData);
MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA;
NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle,
PUCHAR OverlappedRangeStart,
ULONG Length);
MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange
mdbx_SetFileIoOverlappedRange;
MDBX_INTERNAL int osal_mb2w(const char *const src, wchar_t **const pdst);
#endif /* Windows */
#endif /* !__cplusplus */
/*----------------------------------------------------------------------------*/
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint64_t
osal_bswap64(uint64_t v) {
#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \
__has_builtin(__builtin_bswap64)
@ -916,7 +599,7 @@ osal_bswap64(uint64_t v) {
#endif
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t
osal_bswap32(uint32_t v) {
#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \
__has_builtin(__builtin_bswap32)
@ -932,33 +615,3 @@ osal_bswap32(uint32_t v) {
((v >> 8) & UINT32_C(0x0000ff00));
#endif
}
/*----------------------------------------------------------------------------*/
#if defined(_MSC_VER) && _MSC_VER >= 1900
/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
* for internal format-args checker. */
#undef PRIuPTR
#undef PRIiPTR
#undef PRIdPTR
#undef PRIxPTR
#define PRIuPTR "Iu"
#define PRIiPTR "Ii"
#define PRIdPTR "Id"
#define PRIxPTR "Ix"
#define PRIuSIZE "zu"
#define PRIiSIZE "zi"
#define PRIdSIZE "zd"
#define PRIxSIZE "zx"
#endif /* fix PRI*PTR for _MSC_VER */
#ifndef PRIuSIZE
#define PRIuSIZE PRIuPTR
#define PRIiSIZE PRIiPTR
#define PRIdSIZE PRIdPTR
#define PRIxSIZE PRIxPTR
#endif /* PRI*SIZE macros for MSVC */
#ifdef _MSC_VER
#pragma warning(pop)
#endif

579
src/page-get.c Normal file
View File

@ -0,0 +1,579 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/// \brief Logs a corruption report for the given page and returns
/// MDBX_CORRUPTED, so callers can write `return bad_page(...)`.
/// The one-line page summary (type, pgno, mod-txnid) is emitted at most once
/// per distinct page pointer via a function-local static latch; the
/// printf-style details are logged on every call.
__cold int MDBX_PRINTF_ARGS(2, 3)
    bad_page(const page_t *mp, const char *fmt, ...) {
  if (!LOG_ENABLED(MDBX_LOG_ERROR))
    return MDBX_CORRUPTED;

  /* de-duplicate the page headline: report each page pointer only once */
  static const page_t *last_reported;
  if (last_reported != mp) {
    char type_buf[16];
    last_reported = mp;
    debug_log(MDBX_LOG_ERROR, "badpage", 0,
              "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n",
              pagetype_caption(page_type(mp), type_buf), mp->pgno, mp->txnid);
  }

  va_list ap;
  va_start(ap, fmt);
  debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, ap);
  va_end(ap);
  return MDBX_CORRUPTED;
}
/// \brief Logs a notice about a suboptimal (but not corrupted) page.
/// Mirrors bad_page(): the page headline is emitted at most once per
/// distinct page pointer, while the formatted details are logged each call.
/// Returns nothing — a poor page is not an error.
__cold void MDBX_PRINTF_ARGS(2, 3)
    poor_page(const page_t *mp, const char *fmt, ...) {
  if (!LOG_ENABLED(MDBX_LOG_NOTICE))
    return;

  /* de-duplicate the page headline: report each page pointer only once */
  static const page_t *last_reported;
  if (last_reported != mp) {
    char type_buf[16];
    last_reported = mp;
    debug_log(MDBX_LOG_NOTICE, "poorpage", 0,
              "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n",
              pagetype_caption(page_type(mp), type_buf), mp->pgno, mp->txnid);
  }

  va_list ap;
  va_start(ap, fmt);
  debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, ap);
  va_end(ap);
}
/* Returns the value-comparator/length-limits descriptor for the cursor.
 * For an inner (nested dupsort) cursor there is no real value part, so a
 * stub with zero length limits and a defensive comparator is returned. */
MDBX_CONST_FUNCTION static clc_t value_clc(const MDBX_cursor *mc) {
  if (unlikely(mc->flags & z_inner)) {
    const clc_t nested_stub = {.cmp = cmp_equal_or_wrong, .lmin = 0, .lmax = 0};
    return nested_stub;
  }
  return mc->clc->v;
}
/// \brief Exhaustively validates a page's content against the cursor's tree
/// parameters and the transaction's geometry.
/// Checks the page address/flags, the lower/upper bounds, then walks every
/// node (including nested sub-pages of dupsort trees) verifying key sizes,
/// key ordering, node flags and data sizes. Collects as many problems as
/// possible instead of stopping at the first one.
/// \return MDBX_SUCCESS, MDBX_CORRUPTED (via bad_page()), or the error from
/// fetching a referenced large-page.
__cold int page_check(const MDBX_cursor *const mc, const page_t *const mp) {
  DKBUF;
  int rc = MDBX_SUCCESS;
  if (unlikely(mp->pgno < MIN_PAGENO || mp->pgno > MAX_PAGENO))
    rc = bad_page(mp, "invalid pgno (%u)\n", mp->pgno);

  MDBX_env *const env = mc->txn->env;
  const ptrdiff_t offset = ptr_dist(mp, env->dxb_mmap.base);
  unsigned flags_mask = P_ILL_BITS;
  unsigned flags_expected = 0;
  if (offset < 0 ||
      offset > (ptrdiff_t)(pgno2bytes(env, mc->txn->geo.first_unallocated) -
                           ((mp->flags & P_SUBP) ? PAGEHDRSZ + 1 : env->ps))) {
    /* outside of the map: should be a dirty page without MDBX_WRITEMAP,
     * or a subpage of one. */
    flags_mask -= P_SUBP;
    if ((env->flags & MDBX_WRITEMAP) != 0 ||
        (!is_shadowed(mc->txn, mp) && !(mp->flags & P_SUBP)))
      rc = bad_page(mp, "invalid page-address %p, offset %zi\n",
                    __Wpedantic_format_voidptr(mp), offset);
  } else if (offset & (env->ps - 1))
    /* inside the map but not page-aligned: must be a sub-page */
    flags_expected = P_SUBP;

  if (unlikely((mp->flags & flags_mask) != flags_expected))
    rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n",
                  mp->flags & flags_mask, flags_expected);

  cASSERT(mc, (mc->checking & z_dupfix) == 0 || (mc->flags & z_inner) != 0);
  const uint8_t type = page_type(mp);
  switch (type) {
  default:
    return bad_page(mp, "invalid type (%u)\n", type);
  case P_LARGE:
    /* large/overflow pages carry raw data only, no nodes to walk */
    if (unlikely(mc->flags & z_inner))
      rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large",
                    "nested dupsort tree", mc->tree->flags);
    const pgno_t npages = mp->pages;
    if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2))
      rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages);
    if (unlikely(mp->pgno + npages > mc->txn->geo.first_unallocated))
      rc = bad_page(
          mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n",
          mp->pgno + npages, mc->txn->geo.first_unallocated);
    return rc; //-------------------------- end of large/overflow page handling
  case P_LEAF | P_SUBP:
    if (unlikely(mc->tree->height != 1))
      rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n",
                    "leaf-sub", "nested dupsort db", mc->tree->flags);
    /* fall through */
    __fallthrough;
  case P_LEAF:
    if (unlikely((mc->checking & z_dupfix) != 0))
      /* fixed: message said "db-lags", matching siblings say "db-flags" */
      rc = bad_page(mp,
                    "unexpected leaf-page for dupfix subtree (db-flags 0x%x)\n",
                    mc->tree->flags);
    break;
  case P_LEAF | P_DUPFIX | P_SUBP:
    if (unlikely(mc->tree->height != 1))
      rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n",
                    "leaf2-sub", "nested dupsort db", mc->tree->flags);
    /* fall through */
    __fallthrough;
  case P_LEAF | P_DUPFIX:
    if (unlikely((mc->checking & z_dupfix) == 0))
      rc = bad_page(
          mp,
          "unexpected leaf2-page for non-dupfix (sub)tree (db-flags 0x%x)\n",
          mc->tree->flags);
    break;
  case P_BRANCH:
    break;
  }

  if (unlikely(mp->upper < mp->lower || (mp->lower & 1) ||
               PAGEHDRSZ + mp->upper > env->ps))
    rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n",
                  mp->lower, mp->upper, page_space(env));

  const char *const end_of_page = ptr_disp(mp, env->ps);
  const size_t nkeys = page_numkeys(mp);
  STATIC_ASSERT(P_BRANCH == 1);
  /* a branch page must have at least 2 keys, a leaf at least 1 —
   * except during updates or for empty nested trees */
  if (unlikely(nkeys <= (uint8_t)(mp->flags & P_BRANCH))) {
    if ((!(mc->flags & z_inner) || mc->tree->items) &&
        (!(mc->checking & z_updating) ||
         !(is_modifable(mc->txn, mp) || (mp->flags & P_SUBP))))
      rc =
          bad_page(mp, "%s-page nkeys (%zu) < %u\n",
                   is_branch(mp) ? "branch" : "leaf", nkeys, 1 + is_branch(mp));
  }

  const size_t ksize_max = keysize_max(env->ps, 0);
  const size_t leaf2_ksize = mp->dupfix_ksize;
  if (is_dupfix_leaf(mp)) {
    /* DUPFIX (leaf2) pages store fixed-size keys back-to-back,
     * without nodes */
    if (unlikely((mc->flags & z_inner) == 0 ||
                 (mc->tree->flags & MDBX_DUPFIXED) == 0))
      rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n",
                    mc->tree->flags);
    else if (unlikely(leaf2_ksize != mc->tree->dupfix_size))
      rc = bad_page(mp, "invalid leaf2_ksize %zu\n", leaf2_ksize);
    else if (unlikely(((leaf2_ksize & nkeys) ^ mp->upper) & 1))
      rc = bad_page(
          mp, "invalid page upper (%u) for nkeys %zu with leaf2-length %zu\n",
          mp->upper, nkeys, leaf2_ksize);
  } else {
    if (unlikely((mp->upper & 1) ||
                 PAGEHDRSZ + mp->upper + nkeys * sizeof(node_t) + nkeys - 1 >
                     env->ps))
      rc =
          bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n",
                   mp->upper, nkeys, page_space(env));
  }

  /* walk all entries, verifying sizes, ordering and nested content */
  MDBX_val here, prev = {0, 0};
  clc_t v_clc = value_clc(mc);
  for (size_t i = 0; i < nkeys; ++i) {
    if (is_dupfix_leaf(mp)) {
      const char *const key = page_dupfix_ptr(mp, i, mc->tree->dupfix_size);
      if (unlikely(end_of_page < key + leaf2_ksize)) {
        rc = bad_page(mp, "leaf2-item beyond (%zu) page-end\n",
                      key + leaf2_ksize - end_of_page);
        continue;
      }
      if (unlikely(leaf2_ksize != mc->clc->k.lmin)) {
        if (unlikely(leaf2_ksize < mc->clc->k.lmin ||
                     leaf2_ksize > mc->clc->k.lmax))
          rc = bad_page(mp,
                        "leaf2-item size (%zu) <> min/max length (%zu/%zu)\n",
                        leaf2_ksize, mc->clc->k.lmin, mc->clc->k.lmax);
        else
          /* narrow the cached limits to the actual fixed key size */
          mc->clc->k.lmin = mc->clc->k.lmax = leaf2_ksize;
      }
      if ((mc->checking & z_ignord) == 0) {
        here.iov_base = (void *)key;
        here.iov_len = leaf2_ksize;
        if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0))
          rc = bad_page(mp, "leaf2-item #%zu wrong order (%s >= %s)\n", i,
                        DKEY(&prev), DVAL(&here));
        prev = here;
      }
    } else {
      const node_t *const node = page_node(mp, i);
      const char *const node_end = ptr_disp(node, NODESIZE);
      if (unlikely(node_end > end_of_page)) {
        rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i,
                      node_end - end_of_page);
        continue;
      }
      const size_t ksize = node_ks(node);
      if (unlikely(ksize > ksize_max))
        rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize);
      const char *const key = node_key(node);
      if (unlikely(end_of_page < key + ksize)) {
        rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i,
                      key + ksize - end_of_page);
        continue;
      }
      /* the 0-th key of a branch page is unused and may be empty */
      if (is_leaf(mp) || i > 0) {
        if (unlikely(ksize < mc->clc->k.lmin || ksize > mc->clc->k.lmax))
          rc = bad_page(
              mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n",
              i, ksize, mc->clc->k.lmin, mc->clc->k.lmax);
        if ((mc->checking & z_ignord) == 0) {
          here.iov_base = (void *)key;
          here.iov_len = ksize;
          if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0))
            rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i,
                          DKEY(&prev), DVAL(&here));
          prev = here;
        }
      }
      if (is_branch(mp)) {
        if ((mc->checking & z_updating) == 0 && i == 0 && unlikely(ksize != 0))
          rc = bad_page(mp, "branch-node[%zu] wrong 0-node key-length (%zu)\n",
                        i, ksize);
        const pgno_t ref = node_pgno(node);
        if (unlikely(ref < MIN_PAGENO) ||
            (unlikely(ref >= mc->txn->geo.first_unallocated) &&
             (unlikely(ref >= mc->txn->geo.now) ||
              !(mc->checking & z_retiring))))
          rc = bad_page(mp, "branch-node[%zu] wrong pgno (%u)\n", i, ref);
        if (unlikely(node_flags(node)))
          rc = bad_page(mp, "branch-node[%zu] wrong flags (%u)\n", i,
                        node_flags(node));
        continue;
      }

      switch (node_flags(node)) {
      default:
        rc =
            bad_page(mp, "invalid node[%zu] flags (%u)\n", i, node_flags(node));
        break;
      case N_BIGDATA /* data on large-page */:
      case 0 /* usual */:
      case N_SUBDATA /* sub-db */:
      case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
      case N_DUPDATA /* short sub-page */:
        break;
      }

      const size_t dsize = node_ds(node);
      const char *const data = node_data(node);
      if (node_flags(node) & N_BIGDATA) {
        /* node payload is the pgno of a large-page holding the value */
        if (unlikely(end_of_page < data + sizeof(pgno_t))) {
          rc = bad_page(
              mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n",
              "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page);
          continue;
        }
        if (unlikely(dsize <= v_clc.lmin || dsize > v_clc.lmax))
          rc = bad_page(
              mp,
              "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n",
              dsize, v_clc.lmin, v_clc.lmax);
        if (unlikely(node_size_len(node_ks(node), dsize) <=
                     mc->txn->env->leaf_nodemax) &&
            mc->tree != &mc->txn->dbs[FREE_DBI])
          poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
        if ((mc->checking & z_retiring) == 0) {
          const pgr_t lp =
              page_get_large(mc, node_largedata_pgno(node), mp->txnid);
          if (unlikely(lp.err != MDBX_SUCCESS))
            return lp.err;
          cASSERT(mc, page_type(lp.page) == P_LARGE);
          const unsigned npages = largechunk_npages(env, dsize);
          if (unlikely(lp.page->pages != npages)) {
            if (lp.page->pages < npages)
              /* fixed grammar: was "too less n-pages" */
              rc = bad_page(lp.page,
                            "too few n-pages %u for bigdata-node (%zu bytes)",
                            lp.page->pages, dsize);
            else if (mc->tree != &mc->txn->dbs[FREE_DBI])
              poor_page(lp.page,
                        "extra n-pages %u for bigdata-node (%zu bytes)",
                        lp.page->pages, dsize);
          }
        }
        continue;
      }
      if (unlikely(end_of_page < data + dsize)) {
        rc = bad_page(mp,
                      "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n",
                      "data", i, nkeys, dsize, data + dsize - end_of_page);
        continue;
      }

      switch (node_flags(node)) {
      default:
        /* wrong, but already handled */
        continue;
      case 0 /* usual */:
        if (unlikely(dsize < v_clc.lmin || dsize > v_clc.lmax)) {
          rc = bad_page(
              mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n",
              dsize, v_clc.lmin, v_clc.lmax);
          continue;
        }
        break;
      case N_SUBDATA /* sub-db */:
        if (unlikely(dsize != sizeof(tree_t))) {
          rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize);
          continue;
        }
        break;
      case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
        if (unlikely(dsize != sizeof(tree_t))) {
          rc = bad_page(mp, "invalid nested-db record size (%zu, expect %zu)\n",
                        dsize, sizeof(tree_t));
          continue;
        }
        break;
      case N_DUPDATA /* short sub-page */:
        if (unlikely(dsize <= PAGEHDRSZ)) {
          rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n",
                        dsize);
          continue;
        } else {
          /* validate the embedded sub-page just like a regular leaf page */
          const page_t *const sp = (page_t *)data;
          switch (sp->flags &
                  /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) {
          case P_LEAF | P_SUBP:
          case P_LEAF | P_DUPFIX | P_SUBP:
            break;
          default:
            rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n",
                          sp->flags);
            continue;
          }

          const char *const end_of_subpage = data + dsize;
          const intptr_t nsubkeys = page_numkeys(sp);
          if (unlikely(nsubkeys == 0) && !(mc->checking & z_updating) &&
              mc->tree->items)
            rc = bad_page(mp, "no keys on a %s-page\n",
                          is_dupfix_leaf(sp) ? "leaf2-sub" : "leaf-sub");

          MDBX_val sub_here, sub_prev = {0, 0};
          for (int ii = 0; ii < nsubkeys; ii++) {
            if (is_dupfix_leaf(sp)) {
              /* DUPFIX pages have no entries[] or node headers */
              const size_t sub_ksize = sp->dupfix_ksize;
              const char *const sub_key =
                  page_dupfix_ptr(sp, ii, mc->tree->dupfix_size);
              if (unlikely(end_of_subpage < sub_key + sub_ksize)) {
                rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n",
                              sub_key + sub_ksize - end_of_subpage);
                continue;
              }
              if (unlikely(sub_ksize != v_clc.lmin)) {
                if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax))
                  rc = bad_page(mp,
                                "nested-leaf2-key size (%zu) <> min/max "
                                "value-length (%zu/%zu)\n",
                                sub_ksize, v_clc.lmin, v_clc.lmax);
                else
                  v_clc.lmin = v_clc.lmax = sub_ksize;
              }
              if ((mc->checking & z_ignord) == 0) {
                sub_here.iov_base = (void *)sub_key;
                sub_here.iov_len = sub_ksize;
                if (sub_prev.iov_base &&
                    unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0))
                  rc = bad_page(mp,
                                "nested-leaf2-key #%u wrong order (%s >= %s)\n",
                                ii, DKEY(&sub_prev), DVAL(&sub_here));
                sub_prev = sub_here;
              }
            } else {
              const node_t *const sub_node = page_node(sp, ii);
              const char *const sub_node_end = ptr_disp(sub_node, NODESIZE);
              if (unlikely(sub_node_end > end_of_subpage)) {
                rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n",
                              end_of_subpage - sub_node_end);
                continue;
              }
              if (unlikely(node_flags(sub_node) != 0))
                rc = bad_page(mp, "nested-node invalid flags (%u)\n",
                              node_flags(sub_node));

              const size_t sub_ksize = node_ks(sub_node);
              const char *const sub_key = node_key(sub_node);
              const size_t sub_dsize = node_ds(sub_node);
              /* char *sub_data = node_data(sub_node); */

              if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax))
                rc = bad_page(mp,
                              "nested-node-key size (%zu) <> min/max "
                              "value-length (%zu/%zu)\n",
                              sub_ksize, v_clc.lmin, v_clc.lmax);
              if ((mc->checking & z_ignord) == 0) {
                sub_here.iov_base = (void *)sub_key;
                sub_here.iov_len = sub_ksize;
                if (sub_prev.iov_base &&
                    unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0))
                  rc = bad_page(mp,
                                "nested-node-key #%u wrong order (%s >= %s)\n",
                                ii, DKEY(&sub_prev), DVAL(&sub_here));
                sub_prev = sub_here;
              }
              if (unlikely(sub_dsize != 0))
                rc = bad_page(mp, "nested-node non-empty data size (%zu)\n",
                              sub_dsize);
              if (unlikely(end_of_subpage < sub_key + sub_ksize))
                rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n",
                              sub_key + sub_ksize - end_of_subpage);
            }
          }
        }
        break;
      }
    }
  }
  return rc;
}
/* Fast header-only validation of a page, inlined per caller with a
 * compile-time constant ILL bitmask of page-flags that are illegal in the
 * caller's context.
 * \param ILL   bitmask of forbidden page-flag bits.
 * \param page  page to validate.
 * \param txn   transaction the page was fetched within.
 * \param front upper bound for the page's mod-txnid.
 * \return MDBX_SUCCESS or MDBX_CORRUPTED (via bad_page()). */
static __always_inline int check_page_header(const uint16_t ILL,
                                             const page_t *page,
                                             MDBX_txn *const txn,
                                             const txnid_t front) {
  if (unlikely(page->flags & ILL)) {
    /* pick the most descriptive message for the forbidden-flags case */
    if (ILL == P_ILL_BITS || (page->flags & P_ILL_BITS))
      return bad_page(page, "invalid page's flags (%u)\n", page->flags);
    else if (ILL & P_LARGE) {
      /* caller expected a branch/leaf page, got a large/overflow one */
      assert((ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0);
      assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX));
      return bad_page(page, "unexpected %s instead of %s (%u)\n",
                      "large/overflow", "branch/leaf/leaf2", page->flags);
    } else if (ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) {
      /* caller expected a large/overflow page, got a branch/leaf one */
      assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_DUPFIX));
      assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX));
      return bad_page(page, "unexpected %s instead of %s (%u)\n",
                      "branch/leaf/leaf2", "large/overflow", page->flags);
    } else {
      assert(false);
    }
  }

  /* the page must not be modified by a txn newer than `front` */
  if (unlikely(page->txnid > front) &&
      unlikely(page->txnid > txn->front_txnid || front < txn->txnid))
    return bad_page(
        page,
        "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n",
        page->txnid,
        (front == txn->front_txnid && front != txn->txnid) ? "front-txn"
                                                           : "parent-page",
        front);

  if (((ILL & P_LARGE) || !is_largepage(page)) &&
      (ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0) {
    /* branch/leaf page: sanity-check the free-space bounds.
     * Checking the parity of page->upper here either yields false positives
     * or is too costly. The catch is that upper may be odd on DUPFIX pages
     * when there is an odd number of odd-length items. So the parity of
     * page->upper is not checked here; the corresponding full checks are
     * done in page_check(). */
    if (unlikely(page->upper < page->lower || (page->lower & 1) ||
                 PAGEHDRSZ + page->upper > txn->env->ps))
      return bad_page(page,
                      "invalid page' lower(%u)/upper(%u) with limit %zu\n",
                      page->lower, page->upper, page_space(txn->env));
  } else if ((ILL & P_LARGE) == 0) {
    /* large/overflow page: validate the chunk length and its end bound */
    const pgno_t npages = page->pages;
    if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2))
      return bad_page(page, "invalid n-pages (%u) for large-page\n", npages);
    if (unlikely(page->pgno + npages > txn->geo.first_unallocated))
      return bad_page(
          page,
          "end of large-page beyond (%u) allocated space (%u next-pgno)\n",
          page->pgno + npages, txn->geo.first_unallocated);
  } else {
    assert(false);
  }
  return MDBX_SUCCESS;
}
/* Slow-path full validation: header check followed by the exhaustive
 * page_check() walk. Marks the transaction as broken (MDBX_TXN_ERROR)
 * on any failure. Kept out-of-line since it is only reached when
 * z_pagecheck is enabled. */
__cold static __noinline pgr_t check_page_complete(const uint16_t ILL,
                                                   page_t *page,
                                                   const MDBX_cursor *const mc,
                                                   const txnid_t front) {
  pgr_t result;
  result.page = page;
  result.err = check_page_header(ILL, page, mc->txn, front);
  if (result.err == MDBX_SUCCESS)
    result.err = page_check(mc, page);
  if (result.err != MDBX_SUCCESS)
    mc->txn->flags |= MDBX_TXN_ERROR;
  return result;
}
/* Core page-fetch routine, force-inlined so that the constant ILL mask
 * folds into each of the three public wrappers below.
 * Resolves `pgno` to an in-memory page: normally via the mmap, but for a
 * non-WRITEMAP write-txn the dirty/spilled lists of the txn chain take
 * precedence. Validates the result (header-only by default, full check
 * when z_pagecheck is set) and poisons the txn with MDBX_TXN_ERROR on
 * any failure. */
static __always_inline pgr_t page_get_inline(const uint16_t ILL,
                                             const MDBX_cursor *const mc,
                                             const pgno_t pgno,
                                             const txnid_t front) {
  MDBX_txn *const txn = mc->txn;
  tASSERT(txn, front <= txn->front_txnid);

  pgr_t r;
  if (unlikely(pgno >= txn->geo.first_unallocated)) {
    ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno);
    r.page = nullptr;
    r.err = MDBX_PAGE_NOTFOUND;
  bailout:
    txn->flags |= MDBX_TXN_ERROR;
    return r;
  }

  eASSERT(txn->env, ((txn->flags ^ txn->env->flags) & MDBX_WRITEMAP) == 0);
  r.page = pgno2page(txn->env, pgno);
  if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) {
    /* walk this txn and its parents looking for a private dirty copy */
    const MDBX_txn *spiller = txn;
    do {
      /* Spilled pages were dirtied in this txn and flushed
       * because the dirty list got full. Bring this page
       * back in from the map (but don't unspill it here,
       * leave that unless page_touch happens again). */
      if (unlikely(spiller->flags & MDBX_TXN_SPILLS) &&
          spill_search(spiller, pgno))
        break;
      const size_t i = dpl_search(spiller, pgno);
      tASSERT(txn, (intptr_t)i > 0);
      if (spiller->tw.dirtylist->items[i].pgno == pgno) {
        r.page = spiller->tw.dirtylist->items[i].ptr;
        break;
      }
      spiller = spiller->parent;
    } while (unlikely(spiller));
  }

  if (unlikely(r.page->pgno != pgno)) {
    r.err = bad_page(
        r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n",
        r.page->pgno, pgno);
    goto bailout;
  }

  if (unlikely(mc->checking & z_pagecheck))
    return check_page_complete(ILL, r.page, mc, front);

#if MDBX_DISABLE_VALIDATION
  r.err = MDBX_SUCCESS;
#else
  r.err = check_page_header(ILL, r.page, txn, front);
  if (unlikely(r.err != MDBX_SUCCESS))
    goto bailout;
#endif /* MDBX_DISABLE_VALIDATION */
  return r;
}
/* Fetches a page that may be of any valid type: only the always-illegal
 * flag bits (P_ILL_BITS) are rejected. */
pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno,
                   const txnid_t front) {
  return page_get_inline(P_ILL_BITS, mc, pgno, front);
}
/* Fetches a branch/leaf/leaf2 page; large/overflow pages are rejected
 * in addition to P_ILL_BITS. This is the hot path of b-tree descent. */
__hot pgr_t page_get_three(const MDBX_cursor *const mc, const pgno_t pgno,
                           const txnid_t front) {
  return page_get_inline(P_ILL_BITS | P_LARGE, mc, pgno, front);
}
/* Fetches a large/overflow page; branch/leaf/leaf2 pages are rejected
 * in addition to P_ILL_BITS. */
pgr_t page_get_large(const MDBX_cursor *const mc, const pgno_t pgno,
                     const txnid_t front) {
  return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_DUPFIX, mc, pgno,
                         front);
}

198
src/page-iov.c Normal file
View File

@ -0,0 +1,198 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Prepares an I/O-vector context on top of the environment's ioring for
 * writing up to `items` chunks / `npages` pages to `fd`.
 * Post-write coherency verification is armed (timestamp = 0) when the caller
 * requests it or when incoherence has ever been observed for this env;
 * otherwise it is disabled via the UINT64_MAX sentinel.
 * \return MDBX_SUCCESS or the error from osal_ioring_prepare(),
 *         also stored in ctx->err. */
int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages,
             mdbx_filehandle_t fd, bool check_coherence) {
  ctx->env = txn->env;
  ctx->ior = &txn->env->ioring;
  ctx->fd = fd;
  ctx->coherency_timestamp =
      (check_coherence || txn->env->lck->pgops.incoherence.weak)
          ? 0
          : UINT64_MAX /* do not perform the coherency check */;
  ctx->err = osal_ioring_prepare(ctx->ior, items,
                                 pgno_align2os_bytes(txn->env, npages));
  if (likely(ctx->err == MDBX_SUCCESS)) {
#if MDBX_NEED_WRITTEN_RANGE
    /* start with an empty [begin, end) written-pages range */
    ctx->flush_begin = MAX_PAGENO;
    ctx->flush_end = MIN_PAGENO;
#endif /* MDBX_NEED_WRITTEN_RANGE */
    osal_ioring_reset(ctx->ior);
  }
  return ctx->err;
}
/* Per-chunk callback invoked by osal_ioring_walk() after a write batch:
 * optionally verifies that the just-written bytes are visible through the
 * read-only mmap (unified page-cache coherency check), then releases the
 * shadow copies of the written dirty pages. Only used without MDBX_WRITEMAP.
 * \param offset  byte offset of the chunk within the data file.
 * \param data    the shadow buffer that was written (one or more pages).
 * \param bytes   length of the chunk in bytes. */
static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data,
                                    size_t bytes) {
  MDBX_env *const env = ctx->env;
  eASSERT(env, (env->flags & MDBX_WRITEMAP) == 0);

  page_t *wp = (page_t *)data;
  eASSERT(env, wp->pgno == bytes2pgno(env, offset));
  eASSERT(env, bytes2pgno(env, bytes) >= (is_largepage(wp) ? wp->pages : 1u));
  eASSERT(env, (wp->flags & P_ILL_BITS) == 0);

  if (likely(ctx->err == MDBX_SUCCESS)) {
    /* rp = the same bytes as seen through the read-only mapping */
    const page_t *const rp = ptr_disp(env->dxb_mmap.base, offset);
    VALGRIND_MAKE_MEM_DEFINED(rp, bytes);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes);
    osal_flush_incoherent_mmap(rp, bytes, globals.sys_pagesize);
    /* check with timeout as the workaround
     * for https://libmdbx.dqdkfa.ru/dead-github/issues/269
     *
     * The problem shows up only with reordering: when the meta-page written
     * last "overtakes" previously written pages, i.e. when data written to
     * the file later becomes visible in the mapping earlier than data
     * written before it.
     *
     * Originally a full comparison was always performed here. That gave a
     * complete guarantee against the problem, but incurred overhead. In some
     * scenarios a 10-15% performance drop was observed, and up to 30% in
     * synthetic tests. Of course nobody dug into the reasons, they simply
     * settled on the opinion "libmdbx is no faster than LMDB",
     * for example: https://clck.ru/3386er
     *
     * Therefore, after a series of experiments and tests, the following was
     * implemented:
     * 0. The build option MDBX_FORCE_CHECK_MMAP_COHERENCY=1 enables the full
     *    comparison after writing.
     *    The remaining items are a weighted compromise between a complete
     *    guarantee of detecting the problem and useless costs on systems
     *    without this flaw.
     * 1. On transaction start, the correspondence of the chosen meta-page to
     *    the b-tree root pages is verified. This check has proven sufficient
     *    without the post-write comparison. When "incoherence" is detected,
     *    such cases are counted, and while the counter is non-zero the full
     *    comparison is performed. Thus the code switches into full-comparison
     *    mode if the check that proved sufficient notices the problem even
     *    once.
     * 2. No comparison is done at transaction commit, because:
     *    - with the "non-coherence" flaw present (deferred copying or PTE
     *      update after returning from the write syscall), a check in this
     *      process does not guarantee data freshness in another process,
     *      which may start a transaction right after the commit;
     *    - comparing only the last block almost restores the performance of
     *      large transactions, but simultaneously dilutes confidence in the
     *      absence of failures, devaluing the whole idea;
     *    - after the data, a meta-page will be written whose correspondence
     *      to the b-tree root pages is verified at transaction start, and
     *      only that check has proven sufficient;
     * 3. During spilling, a full comparison of written pages is performed.
     *    There was a temptation to compare only partially, e.g. the start
     *    and the end of each block. But during spilling pages may be evicted
     *    repeatedly, including large/overflow ones. This creates a risk of
     *    reading the old version of a page in the current transaction before
     *    its re-write completes, which could cause extremely rare,
     *    non-reproducible errors. Given that spilling happens extremely
     *    rarely, economy was abandoned in favor of reliability. */
#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY
#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0
#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */
    if ((MDBX_FORCE_CHECK_MMAP_COHERENCY ||
         ctx->coherency_timestamp != UINT64_MAX) &&
        unlikely(memcmp(wp, rp, bytes))) {
      ctx->coherency_timestamp = 0;
      /* bump the incoherence counter with saturation at INT32_MAX */
      env->lck->pgops.incoherence.weak =
          (env->lck->pgops.incoherence.weak >= INT32_MAX)
              ? INT32_MAX
              : env->lck->pgops.incoherence.weak + 1;
      WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->pgno,
              "(workaround for incoherent flaw of unified page/buffer cache)");
      /* re-compare until the data arrives or the timeout elapses */
      do
        if (coherency_timeout(&ctx->coherency_timestamp, wp->pgno, env) !=
            MDBX_RESULT_TRUE) {
          ctx->err = MDBX_PROBLEM;
          break;
        }
      while (unlikely(memcmp(wp, rp, bytes)));
    }
  }

  if (likely(bytes == env->ps))
    /* common case: exactly one page */
    page_shadow_release(env, wp, 1);
  else {
    /* a coalesced chunk: release each page (or large-page run) in turn */
    do {
      eASSERT(env, wp->pgno == bytes2pgno(env, offset));
      eASSERT(env, (wp->flags & P_ILL_BITS) == 0);
      size_t npages = is_largepage(wp) ? wp->pages : 1u;
      size_t chunk = pgno2bytes(env, npages);
      eASSERT(env, bytes >= chunk);
      page_t *next = ptr_disp(wp, chunk);
      page_shadow_release(env, wp, npages);
      wp = next;
      offset += chunk;
      bytes -= chunk;
    } while (bytes);
  }
}
/* Finishes a batched write: for environments without MDBX_WRITEMAP walks
 * the I/O-ring to release the shadow copies of the pages just written,
 * then resets the ring so it can accumulate the next batch. */
static void iov_complete(iov_ctx_t *ctx) {
  const bool writemap = (ctx->env->flags & MDBX_WRITEMAP) != 0;
  if (!writemap)
    osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages);
  osal_ioring_reset(ctx->ior);
}
/* Submits the accumulated I/O-ring to the OS, accounts the performed write
 * operations, logs a failure if any, and completes the batch.
 * Returns MDBX_SUCCESS or the error code reported by the underlying write. */
int iov_write(iov_ctx_t *ctx) {
  eASSERT(ctx->env, !iov_empty(ctx));
  const osal_ioring_write_result_t result =
      osal_ioring_write(ctx->ior, ctx->fd);
#if MDBX_ENABLE_PGOP_STAT
  /* account the number of write syscalls actually issued */
  ctx->env->lck->pgops.wops.weak += result.wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
  ctx->err = result.err;
  if (unlikely(ctx->err != MDBX_SUCCESS))
    ERROR("Write error: %s", mdbx_strerror(ctx->err));
  iov_complete(ctx);
  return ctx->err;
}
/* Queues a dirty page for writing through the I/O-ring.
 *
 * For a shadowed (non-WRITEMAP) page the shadow copy is added to the ring;
 * if the ring is full (MDBX_RESULT_TRUE) the accumulated batch is flushed
 * via iov_write() and the add is retried once.  For WRITEMAP pages nothing
 * is queued unless MDBX_AVOID_MSYNC is enabled (then the page goes through
 * the same ring path via the `doit` label).
 *
 * Returns MDBX_SUCCESS, or an error code (also stored into ctx->err). */
int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, size_t npages) {
  MDBX_env *const env = txn->env;
  tASSERT(txn, ctx->err == MDBX_SUCCESS);
  tASSERT(txn, dp->pgno >= MIN_PAGENO && dp->pgno < txn->geo.first_unallocated);
  tASSERT(txn, is_modifable(txn, dp));
  tASSERT(txn, !(dp->flags & ~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE)));
  if (is_shadowed(txn, dp)) {
    tASSERT(txn, !(txn->flags & MDBX_WRITEMAP));
    /* stamp the page with the current txnid before it goes to disk */
    dp->txnid = txn->txnid;
    tASSERT(txn, is_spilled(txn, dp));
#if MDBX_AVOID_MSYNC
  doit:;
#endif /* MDBX_AVOID_MSYNC */
    int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp,
                              pgno2bytes(env, npages));
    if (unlikely(err != MDBX_SUCCESS)) {
      ctx->err = err;
      if (unlikely(err != MDBX_RESULT_TRUE)) {
        /* hard error: abandon the batch */
        iov_complete(ctx);
        return err;
      }
      /* MDBX_RESULT_TRUE means "ring is full": flush and retry once */
      err = iov_write(ctx);
      tASSERT(txn, iov_empty(ctx));
      if (likely(err == MDBX_SUCCESS)) {
        err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp,
                              pgno2bytes(env, npages));
        if (unlikely(err != MDBX_SUCCESS)) {
          iov_complete(ctx);
          return ctx->err = err;
        }
      }
      tASSERT(txn, ctx->err == MDBX_SUCCESS);
    }
  } else {
    /* WRITEMAP mode: the page is already in the mapped file */
    tASSERT(txn, txn->flags & MDBX_WRITEMAP);
#if MDBX_AVOID_MSYNC
    goto doit;
#endif /* MDBX_AVOID_MSYNC */
  }
#if MDBX_NEED_WRITTEN_RANGE
  /* widen the [flush_begin, flush_end) range to cover this page span */
  ctx->flush_begin =
      (ctx->flush_begin < dp->pgno) ? ctx->flush_begin : dp->pgno;
  ctx->flush_end = (ctx->flush_end > dp->pgno + (pgno_t)npages)
                       ? ctx->flush_end
                       : dp->pgno + (pgno_t)npages;
#endif /* MDBX_NEED_WRITTEN_RANGE */
  return MDBX_SUCCESS;
}

38
src/page-iov.h Normal file
View File

@ -0,0 +1,38 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
#if !(defined(_WIN32) || defined(_WIN64))
#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2
#endif
/* Context of a batched page-write operation performed through the I/O-ring. */
struct iov_ctx {
  MDBX_env *env;        /* environment the pages belong to */
  osal_ioring_t *ior;   /* ring accumulating the page spans to write */
  mdbx_filehandle_t fd; /* destination file descriptor */
  int err;              /* sticky error code of the whole batch */
#ifndef MDBX_NEED_WRITTEN_RANGE
#define MDBX_NEED_WRITTEN_RANGE 1
#endif /* MDBX_NEED_WRITTEN_RANGE */
#if MDBX_NEED_WRITTEN_RANGE
  pgno_t flush_begin; /* lowest page number written in this batch */
  pgno_t flush_end;   /* one past the highest page number written */
#endif /* MDBX_NEED_WRITTEN_RANGE */
  uint64_t coherency_timestamp; /* deadline state for mmap-coherency retries */
};
MDBX_INTERNAL __must_check_result int
iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages,
mdbx_filehandle_t fd, bool check_coherence);
/* Returns true when the I/O-ring holds no pending page spans. */
static inline bool iov_empty(const iov_ctx_t *ctx) {
  return osal_ioring_used(ctx->ior) == 0;
}
MDBX_INTERNAL __must_check_result int iov_page(MDBX_txn *txn, iov_ctx_t *ctx,
page_t *dp, size_t npages);
MDBX_INTERNAL __must_check_result int iov_write(iov_ctx_t *ctx);

772
src/page-ops.c Normal file
View File

@ -0,0 +1,772 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* For an inner (nested/dupsort) cursor, recovers the tree of the OUTER
 * cursor of the enclosing cursor couple.  Works backwards from mc->tree
 * (which points at subcur_t::nested_tree) via two container_of() steps. */
static inline tree_t *outer_tree(MDBX_cursor *mc) {
  cASSERT(mc, (mc->flags & z_inner) != 0);
  subcur_t *mx = container_of(mc->tree, subcur_t, nested_tree);
  cursor_couple_t *couple = container_of(mx, cursor_couple_t, inner);
  cASSERT(mc, mc->tree == &couple->outer.subcur->nested_tree);
  cASSERT(mc, &mc->clc->k == &couple->outer.clc->v);
  return couple->outer.tree;
}
/* Allocates a single new branch/leaf/dupfix page from the GC and initializes
 * its header, updating the per-tree page counters (and those of the outer
 * tree for an inner cursor).  Returns {page, err}. */
pgr_t page_new(MDBX_cursor *mc, const unsigned flags) {
  cASSERT(mc, (flags & P_LARGE) == 0);
  pgr_t ret = gc_alloc_single(mc);
  if (unlikely(ret.err != MDBX_SUCCESS))
    return ret;
  DEBUG("db %zu allocated new page %" PRIaPGNO, cursor_dbi(mc), ret.page->pgno);
  ret.page->flags = (uint16_t)flags;
  cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY);
  cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY);
#if MDBX_ENABLE_PGOP_STAT
  mc->txn->env->lck->pgops.newly.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  /* P_BRANCH == 1 lets the flag bit be used directly as a 0/1 counter */
  STATIC_ASSERT(P_BRANCH == 1);
  const unsigned is_branch = flags & P_BRANCH;
  /* fresh page: no entries, data area spans from header to page end */
  ret.page->lower = 0;
  ret.page->upper = (indx_t)(mc->txn->env->ps - PAGEHDRSZ);
  mc->tree->branch_pages += is_branch;
  mc->tree->leaf_pages += 1 - is_branch;
  if (unlikely(mc->flags & z_inner)) {
    /* nested tree: mirror the counters into the outer tree's accounting */
    tree_t *outer = outer_tree(mc);
    outer->branch_pages += is_branch;
    outer->leaf_pages += 1 - is_branch;
  }
  return ret;
}
/* Allocates a new large/overflow page span of `npages` contiguous pages
 * from the GC and initializes its header.  Large pages never belong to an
 * inner (nested) tree.  Returns {page, err}. */
pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) {
  pgr_t ret = likely(npages == 1) ? gc_alloc_single(mc)
                                  : gc_alloc_ex(mc, npages, ALLOC_DEFAULT);
  if (unlikely(ret.err != MDBX_SUCCESS))
    return ret;
  DEBUG("dbi %zu allocated new large-page %" PRIaPGNO ", num %zu",
        cursor_dbi(mc), ret.page->pgno, npages);
  ret.page->flags = P_LARGE;
  cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY);
  cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY);
#if MDBX_ENABLE_PGOP_STAT
  mc->txn->env->lck->pgops.newly.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
  mc->tree->large_pages += (pgno_t)npages;
  ret.page->pages = (pgno_t)npages;
  cASSERT(mc, !(mc->flags & z_inner));
  return ret;
}
/* Copies the meaningful content of page `src` into `dst` (`size` bytes).
 *
 * For DUPFIX pages only the fixed-size key area is copied; for ordinary
 * branch/leaf pages the unused gap between the entry index (lower) and the
 * node area (upper) is skipped when it is large enough to be worth it.
 * On a corrupted source page the destination is filled with 0xFF and the
 * corruption is reported via bad_page(). */
__hot void page_copy(page_t *const dst, const page_t *const src,
                     const size_t size) {
  STATIC_ASSERT(UINT16_MAX > MDBX_MAX_PAGESIZE - PAGEHDRSZ);
  STATIC_ASSERT(MDBX_MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4);
  void *copy_dst = dst;
  const void *copy_src = src;
  size_t copy_len = size;
  if (src->flags & P_DUPFIX) {
    /* DUPFIX page: content is header + n_keys fixed-size keys */
    copy_len = PAGEHDRSZ + src->dupfix_ksize * page_numkeys(src);
    if (unlikely(copy_len > size))
      goto bailout;
  } else if ((src->flags & P_LARGE) == 0) {
    size_t upper = src->upper, lower = src->lower;
    intptr_t unused = upper - lower;
    /* If page isn't full, just copy the used portion. Adjust
     * alignment so memcpy may copy words instead of bytes. */
    if (unused > MDBX_CACHELINE_SIZE * 3) {
      lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *));
      upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *));
      if (unlikely(upper > copy_len))
        goto bailout;
      /* copy header + entry index, then fall through to copy the tail */
      memcpy(copy_dst, copy_src, lower);
      copy_dst = ptr_disp(copy_dst, upper);
      copy_src = ptr_disp(copy_src, upper);
      copy_len -= upper;
    }
  }
  memcpy(copy_dst, copy_src, copy_len);
  return;
bailout:
  if (src->flags & P_DUPFIX)
    bad_page(src, "%s addr %p, n-keys %zu, ksize %u",
             "invalid/corrupted source page", __Wpedantic_format_voidptr(src),
             page_numkeys(src), src->dupfix_ksize);
  else
    bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page",
             __Wpedantic_format_voidptr(src), src->upper);
  /* poison the destination so corruption cannot be mistaken for valid data */
  memset(dst, -1, size);
}
/* Brings a spilled page back as a dirty shadow copy in the current txn.
 *
 * Searches the spill-list of the current txn and then of its parents; on a
 * hit, allocates a shadow, copies the page content into it and marks it
 * dirty.  A page found in the current txn's own spill-list is removed from
 * it; a page spilled by a parent stays spilled there (flagged P_SPILLED on
 * the copy) until the child commits.  Returns MDBX_PROBLEM if the page is
 * in no spill-list at all (which indicates internal inconsistency). */
__cold pgr_t __must_check_result page_unspill(MDBX_txn *const txn,
                                              const page_t *const mp) {
  VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0);
  tASSERT(txn, is_spilled(txn, mp));
  const MDBX_txn *scan = txn;
  pgr_t ret;
  do {
    tASSERT(txn, (scan->flags & MDBX_TXN_SPILLS) != 0);
    const size_t si = spill_search(scan, mp->pgno);
    if (!si)
      continue;
    const unsigned npages = is_largepage(mp) ? mp->pages : 1;
    ret.page = page_shadow_alloc(txn, npages);
    if (unlikely(!ret.page)) {
      ret.err = MDBX_ENOMEM;
      return ret;
    }
    page_copy(ret.page, mp, pgno2bytes(txn->env, npages));
    if (scan == txn) {
      /* If in current txn, this page is no longer spilled.
       * If it happens to be the last page, truncate the spill list.
       * Otherwise mark it as deleted by setting the LSB. */
      spill_remove(txn, si, npages);
    } /* otherwise, if belonging to a parent txn, the
       * page remains spilled until child commits */
    ret.err = page_dirty(txn, ret.page, npages);
    if (unlikely(ret.err != MDBX_SUCCESS))
      return ret;
#if MDBX_ENABLE_PGOP_STAT
    txn->env->lck->pgops.unspill.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
    /* keep P_SPILLED on copies of pages still spilled by a parent */
    ret.page->flags |= (scan == txn) ? 0 : P_SPILLED;
    ret.err = MDBX_SUCCESS;
    return ret;
  } while (likely((scan = scan->parent) != nullptr &&
                  (scan->flags & MDBX_TXN_SPILLS) != 0));
  ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN
        " not found in the spill-list(s), current txn %" PRIaTXN
        " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN,
        mp->pgno, mp->txnid, txn->txnid, txn->front_txnid,
        txn->env->basal_txn->txnid, txn->env->basal_txn->front_txnid);
  ret.err = MDBX_PROBLEM;
  ret.page = nullptr;
  return ret;
}
/* "Touches" a page that is already modifiable in this txn: refreshes its
 * LRU stamp in the dirty-list, or (in the MDBX_AVOID_MSYNC + WRITEMAP case,
 * when the page is absent from the dirty-list because it was spilled)
 * re-registers it as dirty. */
__hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) {
  tASSERT(txn, is_modifable(txn, mp) && txn->tw.dirtylist);
  tASSERT(txn, !is_largepage(mp) && !is_subpage(mp));
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  const size_t n = dpl_search(txn, mp->pgno);
  if (MDBX_AVOID_MSYNC &&
      unlikely(txn->tw.dirtylist->items[n].pgno != mp->pgno)) {
    /* not in the dirty-list: page was spilled, mark it dirty again */
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP));
    tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1);
    VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
#if MDBX_ENABLE_PGOP_STAT
    txn->env->lck->pgops.unspill.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    return page_dirty(txn, (page_t *)mp, 1);
  }
  tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length);
  tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->pgno &&
                   txn->tw.dirtylist->items[n].ptr == mp);
  if (!MDBX_AVOID_MSYNC || (txn->flags & MDBX_WRITEMAP) == 0) {
    /* the LRU counter is stashed in the size_t slot just before the page */
    size_t *const ptr =
        ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
    *ptr = txn->tw.dirtylru;
  }
  return MDBX_SUCCESS;
}
/* Makes a not-yet-modifiable page writable in the current txn.
 *
 * Three cases, depending on the page's status:
 *  - frozen (part of a committed MVCC snapshot): CoW — allocate a new page,
 *    retire the old pgno, repoint the parent node (or tree root);
 *  - spilled: bring it back via page_unspill();
 *  - shadowed by a parent txn's dirty page: clone it into this txn.
 * Finally all cursors of the same DBI that point at the old page are
 * redirected to the new one.  On failure sets MDBX_TXN_ERROR. */
__hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc,
                                 const page_t *const mp) {
  tASSERT(txn, !is_modifable(txn, mp) && !is_largepage(mp));
  if (is_subpage(mp)) {
    /* sub-pages live inside their host page; stamping the txnid suffices */
    ((page_t *)mp)->txnid = txn->front_txnid;
    return MDBX_SUCCESS;
  }
  int rc;
  page_t *np;
  if (is_frozen(txn, mp)) {
    /* CoW the page */
    rc = pnl_need(&txn->tw.retired_pages, 1);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    const pgr_t par = gc_alloc_single(mc);
    rc = par.err;
    np = par.page;
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    const pgno_t pgno = np->pgno;
    DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, cursor_dbi_dbg(mc),
          mp->pgno, pgno);
    tASSERT(txn, mp->pgno != pgno);
    pnl_append_prereserved(txn->tw.retired_pages, mp->pgno);
    /* Update the parent page, if any, to point to the new page */
    if (likely(mc->top)) {
      page_t *parent = mc->pg[mc->top - 1];
      node_t *node = page_node(parent, mc->ki[mc->top - 1]);
      node_set_pgno(node, pgno);
    } else {
      mc->tree->root = pgno;
    }
#if MDBX_ENABLE_PGOP_STAT
    txn->env->lck->pgops.cow.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    page_copy(np, mp, txn->env->ps);
    np->pgno = pgno;
    np->txnid = txn->front_txnid;
  } else if (is_spilled(txn, mp)) {
    pgr_t pur = page_unspill(txn, mp);
    np = pur.page;
    rc = pur.err;
    if (likely(rc == MDBX_SUCCESS)) {
      tASSERT(txn, np != nullptr);
      goto done;
    }
    goto fail;
  } else {
    if (unlikely(!txn->parent)) {
      /* shadowed without a parent txn is an impossible/corrupt state */
      ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s "
            "page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
            " without parent transaction, current txn %" PRIaTXN
            " front %" PRIaTXN,
            is_branch(mp) ? "branch" : "leaf", mp->pgno, mp->txnid,
            mc->txn->txnid, mc->txn->front_txnid);
      rc = MDBX_PROBLEM;
      goto fail;
    }
    DEBUG("clone db %d page %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno);
    tASSERT(txn,
            txn->tw.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
    /* No - copy it */
    np = page_shadow_alloc(txn, 1);
    if (unlikely(!np)) {
      rc = MDBX_ENOMEM;
      goto fail;
    }
    page_copy(np, mp, txn->env->ps);
    /* insert a clone of parent's dirty page, so don't touch dirtyroom */
    rc = page_dirty(txn, np, 1);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
#if MDBX_ENABLE_PGOP_STAT
    txn->env->lck->pgops.clone.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  }
done:
  /* Adjust cursors pointing to mp */
  mc->pg[mc->top] = np;
  MDBX_cursor *m2 = txn->cursors[cursor_dbi(mc)];
  if (mc->flags & z_inner) {
    /* inner cursor: fix up the inner cursors of every couple on this DBI */
    for (; m2; m2 = m2->next) {
      MDBX_cursor *m3 = &m2->subcur->cursor;
      if (m3->top < mc->top)
        continue;
      if (m3->pg[mc->top] == mp)
        m3->pg[mc->top] = np;
    }
  } else {
    for (; m2; m2 = m2->next) {
      if (m2->top < mc->top)
        continue;
      if (m2->pg[mc->top] == mp) {
        m2->pg[mc->top] = np;
        /* leaf replaced: nested cursors may reference nodes inside it */
        if (is_leaf(np) && inner_pointed(m2))
          cursor_inner_refresh(m2, np, m2->ki[mc->top]);
      }
    }
  }
  return MDBX_SUCCESS;
fail:
  txn->flags |= MDBX_TXN_ERROR;
  return rc;
}
/* Allocates a shadow buffer for `num` pages, reusing the environment's
 * single-page reserve list when possible.  The buffer is preceded by a
 * hidden size_t slot (used for the dirty-LRU stamp).  Returns nullptr and
 * sets MDBX_TXN_ERROR on out-of-memory. */
page_t *page_shadow_alloc(MDBX_txn *txn, size_t num) {
  MDBX_env *env = txn->env;
  page_t *np = env->shadow_reserve;
  size_t size = env->ps;
  if (likely(num == 1 && np)) {
    /* pop a single page from the reserve list */
    eASSERT(env, env->shadow_reserve_len > 0);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size);
    VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)),
                           size + sizeof(size_t));
    VALGRIND_MAKE_MEM_DEFINED(&page_next(np), sizeof(page_t *));
    env->shadow_reserve = page_next(np);
    env->shadow_reserve_len -= 1;
  } else {
    size = pgno2bytes(env, num);
    /* extra size_t ahead of the page header for the LRU stamp */
    void *const ptr = osal_malloc(size + sizeof(size_t));
    if (unlikely(!ptr)) {
      txn->flags |= MDBX_TXN_ERROR;
      return nullptr;
    }
    VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t));
    np = ptr_disp(ptr, sizeof(size_t));
  }
  if ((env->flags & MDBX_NOMEMINIT) == 0) {
    /* For a single page alloc, we init everything after the page header.
     * For multi-page, we init the final page; if the caller needed that
     * many pages they will be filling in at least up to the last page. */
    size_t skip = PAGEHDRSZ;
    if (num > 1)
      skip += pgno2bytes(env, num - 1);
    memset(ptr_disp(np, skip), 0, size - skip);
  }
#if MDBX_DEBUG
  np->pgno = 0;
#endif
  VALGRIND_MAKE_MEM_UNDEFINED(np, size);
  np->flags = 0;
  np->pages = (pgno_t)num;
  return np;
}
/* Releases a shadow buffer: single pages are pushed back onto the
 * environment's reserve list (up to dp_reserve_limit); multi-page buffers,
 * and overflow of the reserve, are freed outright. */
void page_shadow_release(MDBX_env *env, page_t *dp, size_t npages) {
  VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages));
  MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages));
  if (unlikely(env->flags & MDBX_PAGEPERTURB))
    memset(dp, -1, pgno2bytes(env, npages));
  if (likely(npages == 1 &&
             env->shadow_reserve_len < env->options.dp_reserve_limit)) {
    /* poison the body, keep only the intrusive next-pointer accessible */
    MDBX_ASAN_POISON_MEMORY_REGION(dp, env->ps);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *));
    page_next(dp) = env->shadow_reserve;
    VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)));
    env->shadow_reserve = dp;
    env->shadow_reserve_len += 1;
  } else {
    /* large pages just get freed directly */
    void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
    VALGRIND_MEMPOOL_FREE(env, ptr);
    osal_free(ptr);
  }
}
/* Deliberately destroys the on-disk content of a reclaimed page span
 * (used for MDBX_PAGEPERTURB / debug builds).  Non-frozen pages are wiped
 * in memory (and written through for non-WRITEMAP); frozen pages are
 * overwritten on disk from the pre-poisoned auxiliary buffer using
 * gathered writes in chunks of MDBX_AUXILARY_IOV_MAX. */
__cold static void page_kill(MDBX_txn *txn, page_t *mp, pgno_t pgno,
                             size_t npages) {
  MDBX_env *const env = txn->env;
  DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno);
  eASSERT(env, pgno >= NUM_METAS && npages);
  if (!is_frozen(txn, mp)) {
    const size_t bytes = pgno2bytes(env, npages);
    memset(mp, -1, bytes);
    mp->pgno = pgno;
    if ((txn->flags & MDBX_WRITEMAP) == 0)
      osal_pwrite(env->lazy_fd, mp, bytes, pgno2bytes(env, pgno));
  } else {
    /* frozen page: cannot touch the mapped copy, overwrite on disk only */
    struct iovec iov[MDBX_AUXILARY_IOV_MAX];
    iov[0].iov_len = env->ps;
    iov[0].iov_base = ptr_disp(env->page_auxbuf, env->ps);
    size_t iov_off = pgno2bytes(env, pgno), n = 1;
    while (--npages) {
      iov[n] = iov[0];
      if (++n == MDBX_AUXILARY_IOV_MAX) {
        osal_pwritev(env->lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, iov_off);
        iov_off += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
        n = 0;
      }
    }
    osal_pwritev(env->lazy_fd, iov, n, iov_off);
  }
}
/* Decides whether a single freed dirty page should go to the loose list
 * (cheap in-txn reuse) rather than the reclaimed list. */
static inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) {
  /* TODO:
   * 1) when "sequence economy" is enabled, check that the page is not
   *    adjacent to any page already present in the reclaimed list;
   * 2) consider dropping half of an overly large loose list into the
   *    reclaimed list. */
  return txn->tw.loose_count < txn->env->options.dp_loose_limit &&
         (!MDBX_ENABLE_REFUND ||
          /* skip pages near to the end in favor of compactification */
          txn->geo.first_unallocated >
                  pgno + txn->env->options.dp_loose_limit ||
          txn->geo.first_unallocated <= txn->env->options.dp_loose_limit);
}
/* Retire, loosen or free a single page.
*
* For dirty pages, saves single pages to a list for future reuse in this same
* txn. It has been pulled from the GC and already resides on the dirty list,
* but has been deleted. Use these pages first before pulling again from the GC.
*
* If the page wasn't dirtied in this txn, just add it
* to this txn's free list. */
int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno,
                   page_t *mp /* maybe null */,
                   unsigned pageflags /* maybe unknown/zero */) {
  int rc;
  MDBX_txn *const txn = mc->txn;
  tASSERT(txn, !mp || (mp->pgno == pgno && mp->flags == pageflags));
  /* During deleting entire subtrees, it is reasonable and possible to avoid
   * reading leaf pages, i.e. significantly reduce hard page-faults & IOPs:
   *  - mp is null, i.e. the page has not yet been read;
   *  - pagetype is known and the P_LEAF bit is set;
   *  - we can determine the page status via scanning the lists
   *    of dirty and spilled pages.
   *
   * On the other hand, this could be suboptimal for WRITEMAP mode, since
   * requires support the list of dirty pages and avoid explicit spilling.
   * So for flexibility and avoid extra internal dependencies we just
   * fallback to reading if dirty list was not allocated yet. */
  size_t di = 0, si = 0, npages = 1;
  enum page_status {
    unknown,
    frozen,
    spilled,
    shadowed,
    modifable
  } status = unknown;
  if (unlikely(!mp)) {
    /* the page was not read; try to classify it by the lists alone */
    if (ASSERT_ENABLED() && pageflags) {
      pgr_t check;
      check = page_get_any(mc, pgno, txn->front_txnid);
      if (unlikely(check.err != MDBX_SUCCESS))
        return check.err;
      tASSERT(txn, ((unsigned)check.page->flags & ~P_SPILLED) ==
                       (pageflags & ~P_FROZEN));
      tASSERT(txn, !(pageflags & P_FROZEN) || is_frozen(txn, check.page));
    }
    if (pageflags & P_FROZEN) {
      status = frozen;
      if (ASSERT_ENABLED()) {
        for (MDBX_txn *scan = txn; scan; scan = scan->parent) {
          tASSERT(txn, !txn->tw.spilled.list || !spill_search(scan, pgno));
          tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno));
        }
      }
      goto status_done;
    } else if (pageflags && txn->tw.dirtylist) {
      if ((di = dpl_exist(txn, pgno)) != 0) {
        mp = txn->tw.dirtylist->items[di].ptr;
        tASSERT(txn, is_modifable(txn, mp));
        status = modifable;
        goto status_done;
      }
      if ((si = spill_search(txn, pgno)) != 0) {
        status = spilled;
        goto status_done;
      }
      for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
        if (dpl_exist(parent, pgno)) {
          status = shadowed;
          goto status_done;
        }
        if (spill_search(parent, pgno)) {
          status = spilled;
          goto status_done;
        }
      }
      status = frozen;
      goto status_done;
    }
    /* cannot classify without reading: fetch the page */
    pgr_t pg = page_get_any(mc, pgno, txn->front_txnid);
    if (unlikely(pg.err != MDBX_SUCCESS))
      return pg.err;
    mp = pg.page;
    tASSERT(txn, !pageflags || mp->flags == pageflags);
    pageflags = mp->flags;
  }
  /* classify a page we do have in hand, with cross-checks */
  if (is_frozen(txn, mp)) {
    status = frozen;
    tASSERT(txn, !is_modifable(txn, mp));
    tASSERT(txn, !is_spilled(txn, mp));
    tASSERT(txn, !is_shadowed(txn, mp));
    tASSERT(txn, !debug_dpl_find(txn, pgno));
    tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
  } else if (is_modifable(txn, mp)) {
    status = modifable;
    if (txn->tw.dirtylist)
      di = dpl_exist(txn, pgno);
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) || !is_spilled(txn, mp));
    tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
  } else if (is_shadowed(txn, mp)) {
    status = shadowed;
    tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
    tASSERT(txn, !debug_dpl_find(txn, pgno));
  } else {
    tASSERT(txn, is_spilled(txn, mp));
    status = spilled;
    si = spill_search(txn, pgno);
    tASSERT(txn, !debug_dpl_find(txn, pgno));
  }
status_done:
  /* update the per-tree page-count bookkeeping */
  if (likely((pageflags & P_LARGE) == 0)) {
    STATIC_ASSERT(P_BRANCH == 1);
    const bool is_branch = pageflags & P_BRANCH;
    cASSERT(mc, ((pageflags & P_LEAF) == 0) == is_branch);
    if (unlikely(mc->flags & z_inner)) {
      tree_t *outer = outer_tree(mc);
      cASSERT(mc, !is_branch || outer->branch_pages > 0);
      outer->branch_pages -= is_branch;
      cASSERT(mc, is_branch || outer->leaf_pages > 0);
      outer->leaf_pages -= 1 - is_branch;
    }
    cASSERT(mc, !is_branch || mc->tree->branch_pages > 0);
    mc->tree->branch_pages -= is_branch;
    cASSERT(mc, is_branch || mc->tree->leaf_pages > 0);
    mc->tree->leaf_pages -= 1 - is_branch;
  } else {
    npages = mp->pages;
    cASSERT(mc, mc->tree->large_pages >= npages);
    mc->tree->large_pages -= (pgno_t)npages;
  }
  if (status == frozen) {
  retire:
    DEBUG("retire %zu page %" PRIaPGNO, npages, pgno);
    rc = pnl_append_span(&txn->tw.retired_pages, pgno, npages);
    tASSERT(txn, dpl_check(txn));
    return rc;
  }
  /* Returning pages to the unallocated "tail" of the database.
   * The page content is not destroyed, and for nested transactions the
   * boundary of the unallocated "tail" is moved only when they commit. */
  if (MDBX_ENABLE_REFUND &&
      unlikely(pgno + npages == txn->geo.first_unallocated)) {
    const char *kind = nullptr;
    if (status == modifable) {
      /* The page was dirtied in this transaction, but before that it could
       * have been allocated, dirtied and spilled in one of the parent
       * transactions.  It CAN be pushed out into the unallocated tail. */
      kind = "dirty";
      /* Remove from dirty list */
      page_wash(txn, di, mp, npages);
    } else if (si) {
      /* The page was spilled in this transaction, i.e. it was allocated
       * and dirtied in this or one of the parent transactions.
       * It CAN be pushed out into the unallocated tail. */
      kind = "spilled";
      tASSERT(txn, status == spilled);
      spill_remove(txn, si, npages);
    } else {
      /* The page was allocated, dirtied and possibly spilled in one of
       * the parent transactions.
       * It CAN be pushed out into the unallocated tail. */
      kind = "parent's";
      if (ASSERT_ENABLED() && mp) {
        kind = nullptr;
        for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
          if (spill_search(parent, pgno)) {
            kind = "parent-spilled";
            tASSERT(txn, status == spilled);
            break;
          }
          if (mp == debug_dpl_find(parent, pgno)) {
            kind = "parent-dirty";
            tASSERT(txn, status == shadowed);
            break;
          }
        }
        tASSERT(txn, kind != nullptr);
      }
      tASSERT(txn, status == spilled || status == shadowed);
    }
    DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno);
    txn->geo.first_unallocated = pgno;
    txn_refund(txn);
    return MDBX_SUCCESS;
  }
  if (status == modifable) {
    /* Dirty page from this transaction */
    /* If suitable we can reuse it through loose list */
    if (likely(npages == 1 && suitable4loose(txn, pgno)) &&
        (di || !txn->tw.dirtylist)) {
      DEBUG("loosen dirty page %" PRIaPGNO, pgno);
      if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
        memset(page_data(mp), -1, txn->env->ps - PAGEHDRSZ);
      mp->txnid = INVALID_TXNID;
      mp->flags = P_LOOSE;
      page_next(mp) = txn->tw.loose_pages;
      txn->tw.loose_pages = mp;
      txn->tw.loose_count++;
#if MDBX_ENABLE_REFUND
      txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl)
                                    ? pgno + 2
                                    : txn->tw.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
      VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->env->ps - PAGEHDRSZ);
      MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), txn->env->ps - PAGEHDRSZ);
      return MDBX_SUCCESS;
    }
#if !MDBX_DEBUG && !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
    if (unlikely(txn->env->flags & MDBX_PAGEPERTURB))
#endif
    {
      /* The page could have been modified in one of the parent
       * transactions, including being spilled later and then loaded and
       * modified again.  In both cases it must not be wiped on disk nor
       * marked inaccessible for asan and/or valgrind. */
      for (MDBX_txn *parent = txn->parent;
           parent && (parent->flags & MDBX_TXN_SPILLS);
           parent = parent->parent) {
        if (spill_intersect(parent, pgno, npages))
          goto skip_invalidate;
        if (dpl_intersect(parent, pgno, npages))
          goto skip_invalidate;
      }
#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)
      if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
#endif
        page_kill(txn, mp, pgno, npages);
      if ((txn->flags & MDBX_WRITEMAP) == 0) {
        VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->env, pgno)),
                                   pgno2bytes(txn->env, npages) - PAGEHDRSZ);
        MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->env, pgno)),
                                       pgno2bytes(txn->env, npages) -
                                           PAGEHDRSZ);
      }
    }
  skip_invalidate:
    /* wash dirty page */
    page_wash(txn, di, mp, npages);
  reclaim:
    DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno);
    rc = pnl_insert_span(&txn->tw.relist, pgno, npages);
    tASSERT(txn,
            pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
                                                    MDBX_ENABLE_REFUND));
    tASSERT(txn, dpl_check(txn));
    return rc;
  }
  if (si) {
    /* Page was spilled in this txn */
    spill_remove(txn, si, npages);
    /* The page could have been allocated and then spilled in this
     * transaction, in which case it must be placed into the reclaimed
     * list.  Or it could have been allocated in one of the parent
     * transactions and then spilled in this one, in which case it must
     * be placed into the retired list for later filtering at commit. */
    for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
      if (dpl_exist(parent, pgno))
        goto retire;
    }
    /* The page was definitely allocated in this transaction
     * and now can be reused. */
    goto reclaim;
  }
  if (status == shadowed) {
    /* Dirty page MUST BE a clone from (one of) parent transaction(s). */
    if (ASSERT_ENABLED()) {
      const page_t *parent_dp = nullptr;
      /* Check parent(s)'s dirty lists. */
      for (MDBX_txn *parent = txn->parent; parent && !parent_dp;
           parent = parent->parent) {
        tASSERT(txn, !spill_search(parent, pgno));
        parent_dp = debug_dpl_find(parent, pgno);
      }
      tASSERT(txn, parent_dp && (!mp || parent_dp == mp));
    }
    /* The page was allocated in a parent transaction and now can be
     * reused, but only within this transaction or its children. */
    goto reclaim;
  }
  /* The page may be part of an MVCC snapshot visible to readers, or it
   * could have been allocated and then spilled in one of the parent
   * transactions.  So for now place it into the retired list, which will
   * be filtered against the dirty- and spilled-lists of the parent
   * transactions when child transactions commit, or will be written
   * to the GC unchanged. */
  goto retire;
}
/* Registers page(s) as dirty in the current transaction.
 *
 * In pure WRITEMAP mode (no dirty-list) only the counter of dirty mapped
 * pages is bumped.  Otherwise the page is appended to the dirty-page list;
 * when dirtyroom is exhausted, a loose page is purged to make room, or
 * MDBX_TXN_FULL is returned.  Sets MDBX_TXN_ERROR on internal failure. */
__hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp,
                                         size_t npages) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  mp->txnid = txn->front_txnid;
  if (!txn->tw.dirtylist) {
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
    txn->tw.writemap_dirty_npages += npages;
    tASSERT(txn, txn->tw.spilled.list == nullptr);
    return MDBX_SUCCESS;
  }
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
#if xMDBX_DEBUG_SPILLING == 2
  txn->env->debug_dirtied_act += 1;
  ENSURE(txn->env, txn->env->debug_dirtied_act < txn->env->debug_dirtied_est);
  ENSURE(txn->env, txn->tw.dirtyroom + txn->tw.loose_count > 0);
#endif /* xMDBX_DEBUG_SPILLING == 2 */
  int rc;
  if (unlikely(txn->tw.dirtyroom == 0)) {
    if (txn->tw.loose_count) {
      /* reclaim a loose page to free up one dirty-list slot */
      page_t *lp = txn->tw.loose_pages;
      DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->pgno);
      rc = pnl_insert_span(&txn->tw.relist, lp->pgno, 1);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
      size_t di = dpl_search(txn, lp->pgno);
      tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp);
      dpl_remove(txn, di);
      MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
      VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
      txn->tw.loose_pages = page_next(lp);
      txn->tw.loose_count--;
      txn->tw.dirtyroom++;
      if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
        page_shadow_release(txn->env, lp, 1);
    } else {
      ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length);
      if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
        page_shadow_release(txn->env, mp, npages);
      return MDBX_TXN_FULL;
    }
  }
  rc = dpl_append(txn, mp->pgno, mp, npages);
  if (unlikely(rc != MDBX_SUCCESS)) {
  bailout:
    txn->flags |= MDBX_TXN_ERROR;
    return rc;
  }
  txn->tw.dirtyroom--;
  tASSERT(txn, dpl_check(txn));
  return MDBX_SUCCESS;
}
/* Computes how many extra bytes to reserve inside a DUPFIX sub-page so that
 * a few more items can be appended without rebuilding it.  The reserve
 * grows in item_len steps (at most 5), bounded by the environment's
 * subpage_reserve_limit / subpage_limit and by the room available in the
 * host page; an odd final sub-page length gets one more padding byte. */
size_t page_subleaf2_reserve(const MDBX_env *const env, size_t host_page_room,
                             size_t subpage_len, size_t item_len) {
  eASSERT(env, (subpage_len & 1) == 0);
  eASSERT(env, env->subpage_reserve_prereq > env->subpage_room_threshold +
                       env->subpage_reserve_limit &&
                   env->leaf_nodemax >= env->subpage_limit + NODESIZE);
  size_t reserve = 0;
  unsigned steps = 0;
  while (steps < 5 && reserve + item_len <= env->subpage_reserve_limit) {
    const size_t grown = EVEN_CEIL(subpage_len + item_len);
    if (grown > env->subpage_limit ||
        host_page_room < env->subpage_reserve_prereq + grown)
      break;
    subpage_len += item_len;
    reserve += item_len;
    ++steps;
  }
  return reserve + (subpage_len & 1);
}

179
src/page-ops.h Normal file
View File

@ -0,0 +1,179 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
MDBX_INTERNAL int __must_check_result tree_search_finalize(MDBX_cursor *mc,
const MDBX_val *key,
int flags);
MDBX_INTERNAL int tree_search_lowest(MDBX_cursor *mc);
enum page_search_flags {
Z_MODIFY = 1,
Z_ROOTONLY = 2,
Z_FIRST = 4,
Z_LAST = 8,
};
MDBX_INTERNAL int __must_check_result tree_search(MDBX_cursor *mc,
const MDBX_val *key,
int flags);
#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */
MDBX_INTERNAL int __must_check_result page_split(MDBX_cursor *mc,
const MDBX_val *const newkey,
MDBX_val *const newdata,
pgno_t newpgno,
const unsigned naf);
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL int MDBX_PRINTF_ARGS(2, 3)
bad_page(const page_t *mp, const char *fmt, ...);
MDBX_INTERNAL void MDBX_PRINTF_ARGS(2, 3)
poor_page(const page_t *mp, const char *fmt, ...);
/* Page-status predicates based on the page's mod-txnid relative to the
 * current transaction:
 *   frozen   — last modified before this txn began (part of a committed
 *              MVCC snapshot);
 *   spilled  — modified in this txn and flushed out (mod-txnid == txnid);
 *   shadowed — modified by a parent txn (mod-txnid > txnid);
 *   modifable— already dirty at the current write front. */
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_frozen(const MDBX_txn *txn,
                                                        const page_t *mp) {
  return mp->txnid < txn->txnid;
}
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_spilled(const MDBX_txn *txn,
                                                         const page_t *mp) {
  return mp->txnid == txn->txnid;
}
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_shadowed(const MDBX_txn *txn,
                                                          const page_t *mp) {
  return mp->txnid > txn->txnid;
}
/* Sanity check: a valid page's mod-txnid never exceeds the write front. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_correct(const MDBX_txn *txn, const page_t *mp) {
  return mp->txnid <= txn->front_txnid;
}
MDBX_NOTHROW_PURE_FUNCTION static inline bool is_modifable(const MDBX_txn *txn,
                                                           const page_t *mp) {
  return mp->txnid == txn->front_txnid;
}
MDBX_INTERNAL int __must_check_result page_check(const MDBX_cursor *const mc,
const page_t *const mp);
MDBX_INTERNAL pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno,
const txnid_t front);
MDBX_INTERNAL pgr_t page_get_three(const MDBX_cursor *const mc,
const pgno_t pgno, const txnid_t front);
MDBX_INTERNAL pgr_t page_get_large(const MDBX_cursor *const mc,
const pgno_t pgno, const txnid_t front);
/* Convenience wrapper around page_get_three(): stores the fetched page
 * into *mp and returns only the error code. */
static inline int __must_check_result page_get(const MDBX_cursor *mc,
                                               const pgno_t pgno, page_t **mp,
                                               const txnid_t front) {
  pgr_t ret = page_get_three(mc, pgno, front);
  *mp = ret.page;
  return ret.err;
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp,
size_t npages);
MDBX_INTERNAL pgr_t page_new(MDBX_cursor *mc, const unsigned flags);
MDBX_INTERNAL pgr_t page_new_large(MDBX_cursor *mc, const size_t npages);
MDBX_INTERNAL int page_touch_modifable(MDBX_txn *txn, const page_t *const mp);
MDBX_INTERNAL int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc,
const page_t *const mp);
/* Ensures the page under the cursor is writable in the current txn,
 * dispatching to the fast path (already modifiable) or the CoW/unspill/
 * clone path (page_touch_unmodifable). */
static inline int page_touch(MDBX_cursor *mc) {
  page_t *const mp = mc->pg[mc->top];
  MDBX_txn *txn = mc->txn;
  tASSERT(txn, mc->txn->flags & MDBX_TXN_DIRTY);
  tASSERT(txn,
          F_ISSET(*cursor_dbi_state(mc), DBI_LINDO | DBI_VALID | DBI_DIRTY));
  tASSERT(txn, !is_largepage(mp));
  if (ASSERT_ENABLED()) {
    if (mc->flags & z_inner) {
      subcur_t *mx = container_of(mc->tree, subcur_t, nested_tree);
      cursor_couple_t *couple = container_of(mx, cursor_couple_t, inner);
      tASSERT(txn, mc->tree == &couple->outer.subcur->nested_tree);
      tASSERT(txn, &mc->clc->k == &couple->outer.clc->v);
      tASSERT(txn, *couple->outer.dbi_state & DBI_DIRTY);
    }
    tASSERT(txn, dpl_check(txn));
  }
  if (is_modifable(txn, mp)) {
    if (!txn->tw.dirtylist) {
      /* pure WRITEMAP mode: pages are modified in place, nothing to track */
      tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC);
      return MDBX_SUCCESS;
    }
    /* sub-pages are tracked via their host page */
    return is_subpage(mp) ? MDBX_SUCCESS : page_touch_modifable(txn, mp);
  }
  return page_touch_unmodifable(txn, mc, mp);
}
MDBX_INTERNAL void page_copy(page_t *const dst, const page_t *const src,
const size_t size);
MDBX_INTERNAL pgr_t __must_check_result page_unspill(MDBX_txn *const txn,
const page_t *const mp);
MDBX_INTERNAL page_t *page_shadow_alloc(MDBX_txn *txn, size_t num);
MDBX_INTERNAL void page_shadow_release(MDBX_env *env, page_t *dp,
size_t npages);
MDBX_INTERNAL int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno,
page_t *mp /* maybe null */,
unsigned pageflags /* maybe unknown/zero */);
static inline int page_retire(MDBX_cursor *mc, page_t *mp) {
  /* Shorthand for page_retire_ex() when the page header is at hand:
   * take the page number and flags straight from the header. */
  const pgno_t pgno = mp->pgno;
  const unsigned pageflags = mp->flags;
  return page_retire_ex(mc, pgno, mp, pageflags);
}
static inline void page_wash(MDBX_txn *txn, size_t di, page_t *const mp,
                             const size_t npages) {
  /* Scrub a no-longer-needed dirty page: invalidate its header, drop it
   * from the transaction's dirty bookkeeping and, when a shadow copy was
   * allocated, release that copy.  `di` is the page's dirty-list index,
   * or 0 when the page is not tracked there. */
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  mp->txnid = INVALID_TXNID;
  mp->flags = P_BAD;
  if (txn->tw.dirtylist) {
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
    tASSERT(txn,
            MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp));
    if (!MDBX_AVOID_MSYNC || di) {
      dpl_remove_ex(txn, di, npages);
      txn->tw.dirtyroom++;
      /* Dirty-room accounting must balance against the parent txn
       * (for nested transactions) or the configured dp_limit. */
      tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                       (txn->parent ? txn->parent->tw.dirtyroom
                                    : txn->env->options.dp_limit));
      if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) {
        /* The page was a separately allocated shadow, not part of the map. */
        page_shadow_release(txn->env, mp, npages);
        return;
      }
    }
  } else {
    /* WRITEMAP without MDBX_AVOID_MSYNC: only a counter of dirty map
     * pages is kept; decrement it, clamping at zero. */
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di);
    txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages)
                                         ? npages
                                         : txn->tw.writemap_dirty_npages;
  }
  /* Poison the payload so stray reads of the washed page are caught by
   * Valgrind/ASAN. */
  VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
  VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
                             pgno2bytes(txn->env, npages) - PAGEHDRSZ);
  MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp),
                                 pgno2bytes(txn->env, npages) - PAGEHDRSZ);
}
MDBX_INTERNAL size_t page_subleaf2_reserve(const MDBX_env *const env,
size_t host_page_room,
size_t subpage_len, size_t item_len);
#define page_next(mp) \
(*(page_t **)ptr_disp((mp)->entries, sizeof(void *) - sizeof(uint32_t)))

147
src/page-search.c Normal file
View File

@ -0,0 +1,147 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Search for the lowest key under the current branch page.
* This just bypasses a numkeys check in the current page
* before calling tree_search_finalize(), because the callers
* are all in situations where the current page is known to
* be underfilled. */
__hot int tree_search_lowest(MDBX_cursor *mc) {
  /* Descend from the branch page currently on top of the cursor stack
   * towards its leftmost leaf, pushing visited pages onto the stack.
   * Returns MDBX_SUCCESS or a page_get/cursor_push error. */
  cASSERT(mc, mc->top >= 0);
  page_t *mp = mc->pg[mc->top];
  cASSERT(mc, is_branch(mp));
  /* Follow the first (lowest-key) child of the current branch page. */
  node_t *node = page_node(mp, 0);
  int err = page_get(mc, node_pgno(node), &mp, mp->txnid);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  mc->ki[mc->top] = 0;
  err = cursor_push(mc, mp, 0);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  return tree_search_finalize(mc, nullptr, Z_FIRST);
}
__hot int tree_search(MDBX_cursor *mc, const MDBX_val *key, int flags) {
  /* Position the cursor at the leaf page for `key` (or at the first/last
   * leaf for Z_FIRST/Z_LAST), (re)loading the root page when necessary.
   * With Z_MODIFY the visited pages are touched for writing; with
   * Z_ROOTONLY the descent stops at the root.  On failure the cursor is
   * reset to the "poor" state. */
  int err;
  if (unlikely(mc->txn->flags & MDBX_TXN_BLOCKED)) {
    DEBUG("%s", "transaction has failed, must abort");
    err = MDBX_BAD_TXN;
  bailout:
    be_poor(mc);
    return err;
  }
  const size_t dbi = cursor_dbi(mc);
  if (unlikely(*cursor_dbi_state(mc) & DBI_STALE)) {
    /* Refresh the sub-database record before trusting its root. */
    err = sdb_fetch(mc->txn, dbi);
    if (unlikely(err != MDBX_SUCCESS))
      goto bailout;
  }
  const pgno_t root = mc->tree->root;
  if (unlikely(root == P_INVALID)) {
    DEBUG("%s", "tree is empty");
    cASSERT(mc, is_poor(mc));
    return MDBX_NOTFOUND;
  }
  cASSERT(mc, root >= NUM_METAS && root < mc->txn->geo.first_unallocated);
  if (mc->top < 0 || mc->pg[0]->pgno != root) {
    /* The stacked root is stale or absent: pick the txnid to validate the
     * root page against, then fetch it. */
    txnid_t pp_txnid = mc->tree->mod_txnid;
    pp_txnid = /* tree->mod_txnid maybe zero in a legacy DB */ pp_txnid
                   ? pp_txnid
                   : mc->txn->txnid;
    if ((mc->txn->flags & MDBX_TXN_RDONLY) == 0) {
      MDBX_txn *scan = mc->txn;
      do
        if ((scan->flags & MDBX_TXN_DIRTY) &&
            (dbi == MAIN_DBI || (scan->dbi_state[dbi] & DBI_DIRTY))) {
          /* After a nested-transaction commit mod_txnid may exceed the
           * front, so use the front txnid of the innermost dirty txn. */
          pp_txnid = scan->front_txnid;
          break;
        }
      while (unlikely((scan = scan->parent) != nullptr));
    }
    err = page_get(mc, root, &mc->pg[0], pp_txnid);
    if (unlikely(err != MDBX_SUCCESS))
      goto bailout;
  }
  mc->top = 0;
  mc->ki[0] = (flags & Z_LAST) ? page_numkeys(mc->pg[0]) - 1 : 0;
  DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", cursor_dbi_dbg(mc),
        root, mc->pg[0]->flags);
  if (flags & Z_MODIFY) {
    err = page_touch(mc);
    if (unlikely(err != MDBX_SUCCESS))
      goto bailout;
  }
  if (flags & Z_ROOTONLY)
    return MDBX_SUCCESS;
  return tree_search_finalize(mc, key, flags);
}
__hot __noinline int tree_search_finalize(MDBX_cursor *mc, const MDBX_val *key,
                                          int flags) {
  /* Walk down from the page on top of the cursor stack to the target
   * leaf: search each branch page for `key` (or take the first/last
   * child for Z_FIRST/Z_LAST), pushing pages as we go and touching them
   * when Z_MODIFY is set. */
  cASSERT(mc, !is_poor(mc));
  DKBUF_DEBUG;
  int err;
  page_t *mp = mc->pg[mc->top];
  intptr_t ki = (flags & Z_FIRST) ? 0 : page_numkeys(mp) - 1;
  while (is_branch(mp)) {
    DEBUG("branch page %" PRIaPGNO " has %zu keys", mp->pgno, page_numkeys(mp));
    /* A valid branch page has at least two children. */
    cASSERT(mc, page_numkeys(mp) > 1);
    DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0)));
    if ((flags & (Z_FIRST | Z_LAST)) == 0) {
      const struct node_search_result nsr = node_search(mc, key);
      if (likely(nsr.node))
        /* Exact hit: follow that child; otherwise the preceding one. */
        ki = mc->ki[mc->top] + (intptr_t)nsr.exact - 1;
      /* NOTE(review): ki is intptr_t but printed with %zu — works on the
       * usual ABIs, but PRIdPTR would match exactly; confirm upstream. */
      DEBUG("following index %zu for key [%s]", ki, DKEY_DEBUG(key));
    }
    err = page_get(mc, node_pgno(page_node(mp, ki)), &mp, mp->txnid);
    if (unlikely(err != MDBX_SUCCESS))
      goto bailout;
    mc->ki[mc->top] = (indx_t)ki;
    ki = (flags & Z_FIRST) ? 0 : page_numkeys(mp) - 1;
    err = cursor_push(mc, mp, ki);
    if (unlikely(err != MDBX_SUCCESS))
      goto bailout;
    if (flags & Z_MODIFY) {
      err = page_touch(mc);
      if (unlikely(err != MDBX_SUCCESS))
        goto bailout;
      mp = mc->pg[mc->top];
    }
  }
  if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) {
    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
          mp->pgno, mp->flags);
    err = MDBX_CORRUPTED;
  bailout:
    be_poor(mc);
    return err;
  }
  DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->pgno,
        DKEY_DEBUG(key));
  /* Logically correct, but (as currently understood) not necessary.
     Still worth further checking/testing: there may be a scenario in
     which clearing the flags is required after all.
     be_filled(mc); */
  return MDBX_SUCCESS;
}

254
src/pnl.c Normal file
View File

@ -0,0 +1,254 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
MDBX_INTERNAL pnl_t pnl_alloc(size_t size) {
  /* Allocate a page-number list with room for at least `size` entries.
   * Layout: pnl[-1] holds the allocated capacity, pnl[0] the current
   * length (initially zero).  Returns nullptr on allocation failure. */
  size_t bytes = pnl_size2bytes(size);
  pnl_t pnl = osal_malloc(bytes);
  if (likely(pnl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    /* Take advantage of the whole usable chunk, not just what was asked. */
    bytes = malloc_usable_size(pnl);
#endif /* malloc_usable_size */
    pnl[0] = pnl_bytes2size(bytes);
    assert(pnl[0] >= size);
    /* Step past the capacity slot; pnl[0] now becomes the length. */
    pnl += 1;
    *pnl = 0;
  }
  return pnl;
}
MDBX_INTERNAL void pnl_free(pnl_t pnl) {
  /* Release a list allocated by pnl_alloc(); nullptr is ignored.
   * The public pointer sits one element past the capacity slot,
   * so step back to the true allocation before freeing. */
  if (likely(pnl != nullptr))
    osal_free(pnl - 1);
}
MDBX_INTERNAL void pnl_shrink(pnl_t __restrict *__restrict ppnl) {
  /* Reset the list to empty and, when it has grown well beyond the
   * initial size, shrink the allocation back towards 2*MDBX_PNL_INITIAL. */
  assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL &&
         pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) <
             MDBX_PNL_INITIAL * 3 / 2);
  assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl));
  MDBX_PNL_SETSIZE(*ppnl, 0);
  if (unlikely(MDBX_PNL_ALLOCLEN(*ppnl) >
               MDBX_PNL_INITIAL * (MDBX_PNL_PREALLOC_FOR_RADIXSORT ? 8 : 4) -
                   MDBX_CACHELINE_SIZE / sizeof(pgno_t))) {
    size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL * 2);
    pnl_t pnl = osal_realloc(*ppnl - 1, bytes);
    if (likely(pnl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
      bytes = malloc_usable_size(pnl);
#endif /* malloc_usable_size */
      *pnl = pnl_bytes2size(bytes);
      *ppnl = pnl + 1;
    }
    /* On realloc failure the old (larger) buffer simply remains in use. */
  }
}
MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl,
                              const size_t wanna) {
  /* Grow the list's capacity to at least `wanna` entries.  Returns
   * MDBX_SUCCESS, MDBX_TXN_FULL when the request exceeds the hard
   * PAGELIST_LIMIT, or MDBX_ENOMEM on allocation failure. */
  const size_t allocated = MDBX_PNL_ALLOCLEN(*ppnl);
  assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl));
  if (likely(allocated >= wanna))
    return MDBX_SUCCESS;
  if (unlikely(wanna > /* paranoia */ PAGELIST_LIMIT)) {
    ERROR("PNL too long (%zu > %zu)", wanna, (size_t)PAGELIST_LIMIT);
    return MDBX_TXN_FULL;
  }
  /* Over-allocate by the current shortfall to amortize repeated growth,
   * capped at the hard limit. */
  const size_t size = (wanna + wanna - allocated < PAGELIST_LIMIT)
                          ? wanna + wanna - allocated
                          : PAGELIST_LIMIT;
  size_t bytes = pnl_size2bytes(size);
  pnl_t pnl = osal_realloc(*ppnl - 1, bytes);
  if (likely(pnl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(pnl);
#endif /* malloc_usable_size */
    *pnl = pnl_bytes2size(bytes);
    assert(*pnl >= wanna);
    *ppnl = pnl + 1;
    return MDBX_SUCCESS;
  }
  return MDBX_ENOMEM;
}
static __always_inline int __must_check_result pnl_append_stepped(
    unsigned step, __restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
  /* Append an arithmetic run of `n` page numbers starting at `pgno` with
   * the given `step`, growing the list as needed.  Entries are written
   * so the list's configured sort order is preserved. */
  assert(n > 0);
  int rc = pnl_need(ppnl, n);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  const pnl_t pnl = *ppnl;
  if (likely(n == 1)) {
    /* Single-entry fast path. */
    pnl_append_prereserved(pnl, pgno);
    return MDBX_SUCCESS;
  }
#if MDBX_PNL_ASCENDING
  size_t w = MDBX_PNL_GETSIZE(pnl);
  do {
    pnl[++w] = pgno;
    pgno += step;
  } while (--n);
  MDBX_PNL_SETSIZE(pnl, w);
#else
  /* Descending order: write the run back-to-front so the lowest
   * page number ends up at the tail. */
  size_t w = MDBX_PNL_GETSIZE(pnl) + n;
  MDBX_PNL_SETSIZE(pnl, w);
  do {
    pnl[w--] = pgno;
    pgno += step;
  } while (--n);
#endif
  return MDBX_SUCCESS;
}
__hot MDBX_INTERNAL int __must_check_result
spill_append_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
  /* Append a span to a spill list.  Spill entries store pgno << 1 and use
   * a step of 2, leaving the least-significant bit free for use as a flag
   * (presumably a spilled/unspilled mark — confirm in the spill code). */
  return pnl_append_stepped(2, ppnl, pgno << 1, n);
}
__hot MDBX_INTERNAL int __must_check_result
pnl_append_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
  /* Append `n` consecutive page numbers starting at `pgno`. */
  return pnl_append_stepped(1, ppnl, pgno, n);
}
__hot MDBX_INTERNAL int __must_check_result
pnl_insert_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) {
  /* Insert `n` consecutive page numbers starting at `pgno` into a sorted
   * list while keeping it sorted: shift entries that sort after the new
   * span towards the tail, then fill the opened gap with the run. */
  assert(n > 0);
  int rc = pnl_need(ppnl, n);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  const pnl_t pnl = *ppnl;
  size_t r = MDBX_PNL_GETSIZE(pnl), w = r + n;
  MDBX_PNL_SETSIZE(pnl, w);
  /* Shift existing entries that must follow the new span. */
  while (r && MDBX_PNL_DISORDERED(pnl[r], pgno))
    pnl[w--] = pnl[r--];
  /* Fill the gap with the run, respecting the list's sort order. */
  for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w)
    pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++;
  return MDBX_SUCCESS;
}
__hot __noinline MDBX_INTERNAL bool pnl_check(const const_pnl_t pnl,
                                              const size_t limit) {
  /* Validate a list: length within PAGELIST_LIMIT, all page numbers in
   * [MIN_PAGENO, limit), and — unless validation is compiled out and
   * auditing is off — strictly ordered (which also rules out dups). */
  assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND);
  if (likely(MDBX_PNL_GETSIZE(pnl))) {
    if (unlikely(MDBX_PNL_GETSIZE(pnl) > PAGELIST_LIMIT))
      return false;
    if (unlikely(MDBX_PNL_LEAST(pnl) < MIN_PAGENO))
      return false;
    if (unlikely(MDBX_PNL_MOST(pnl) >= limit))
      return false;
    if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) &&
        likely(MDBX_PNL_GETSIZE(pnl) > 1)) {
      /* O(n) scan: every adjacent pair must be strictly ordered. */
      const pgno_t *scan = MDBX_PNL_BEGIN(pnl);
      const pgno_t *const end = MDBX_PNL_END(pnl);
      pgno_t prev = *scan++;
      do {
        if (unlikely(!MDBX_PNL_ORDERED(prev, *scan)))
          return false;
        prev = *scan;
      } while (likely(++scan != end));
    }
  }
  return true;
}
static __always_inline void
pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a,
                const pgno_t *__restrict src_b,
                const pgno_t *__restrict const src_b_detent) {
  /* Backward merge of two sorted runs into `dst` (which may overlap the
   * tail of src_a), stopping once src_b reaches its detent.  The caller
   * places a sentinel so src_a cannot underrun. */
  do {
#if MDBX_HAVE_CMOV
    const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a);
#if defined(__LCC__) || __CLANG_PREREQ(13, 0)
    /* lcc 1.26: 13 clocks (setup plus first iteration) + 7 clocks per
     * loop iteration, WITHOUT loop-mode
     * gcc>=7: cmp+jmp with a return into the loop body (WTF?)
     * gcc<=6: cmov×3
     * clang<=12: cmov×3
     * clang>=13: cmov, set+add/sub */
    *dst = flag ? *src_a-- : *src_b--;
#else
    /* gcc: cmov, cmp+set+add/sub
     * clang<=5: cmov×2, set+add/sub
     * clang>=6: cmov, set+add/sub */
    *dst = flag ? *src_a : *src_b;
    src_b += (ptrdiff_t)flag - 1;
    src_a -= flag;
#endif
    --dst;
#else /* MDBX_HAVE_CMOV */
    while (MDBX_PNL_ORDERED(*src_b, *src_a))
      *dst-- = *src_a--;
    *dst-- = *src_b--;
#endif /* !MDBX_HAVE_CMOV */
  } while (likely(src_b > src_b_detent));
}
__hot MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src) {
  /* Merge sorted `src` into sorted `dst` (dst must already have room);
   * returns the resulting length.  Fast paths: empty src, fully disjoint
   * ranges (plain memcpy/memmove), and very short totals that jump
   * straight to the in-place merge to avoid libc call overhead. */
  assert(pnl_check_allocated(dst, MAX_PAGENO + 1));
  assert(pnl_check(src, MAX_PAGENO + 1));
  const size_t src_len = MDBX_PNL_GETSIZE(src);
  const size_t dst_len = MDBX_PNL_GETSIZE(dst);
  size_t total = dst_len;
  assert(MDBX_PNL_ALLOCLEN(dst) >= total);
  if (likely(src_len > 0)) {
    total += src_len;
    if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 21 : 12))
      goto avoid_call_libc_for_short_cases;
    if (dst_len == 0 ||
        MDBX_PNL_ORDERED(MDBX_PNL_LAST(dst), MDBX_PNL_FIRST(src)))
      /* src entirely follows dst: append it. */
      memcpy(MDBX_PNL_END(dst), MDBX_PNL_BEGIN(src), src_len * sizeof(pgno_t));
    else if (MDBX_PNL_ORDERED(MDBX_PNL_LAST(src), MDBX_PNL_FIRST(dst))) {
      /* src entirely precedes dst: shift dst and prepend. */
      memmove(MDBX_PNL_BEGIN(dst) + src_len, MDBX_PNL_BEGIN(dst),
              dst_len * sizeof(pgno_t));
      memcpy(MDBX_PNL_BEGIN(dst), MDBX_PNL_BEGIN(src),
             src_len * sizeof(pgno_t));
    } else {
    avoid_call_libc_for_short_cases:
      /* Interleaved ranges: backward merge with a sentinel in dst[0]. */
      dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID);
      pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src);
    }
    MDBX_PNL_SETSIZE(dst, total);
  }
  assert(pnl_check_allocated(dst, MAX_PAGENO + 1));
  return total;
}
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr))
#else
#define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr))
#endif
RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY,
MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0)
SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED)
__hot __noinline MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl) {
  /* Sort without post-validation: try radix sort for long lists, falling
   * back to the comparison sort (pgno_sort) for short lists or when
   * pgno_radixsort reports it could not run. */
  if (likely(MDBX_PNL_GETSIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) ||
      unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_GETSIZE(pnl))))
    pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl));
}
SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED)
__hot __noinline MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl,
pgno_t pgno) {
const pgno_t *begin = MDBX_PNL_BEGIN(pnl);
const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_GETSIZE(pnl), pgno);
const pgno_t *end = begin + MDBX_PNL_GETSIZE(pnl);
assert(it >= begin && it <= end);
if (it != begin)
assert(MDBX_PNL_ORDERED(it[-1], pgno));
if (it != end)
assert(!MDBX_PNL_ORDERED(it[0], pgno));
return it - begin + 1;
}

161
src/pnl.h Normal file
View File

@ -0,0 +1,161 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* An PNL is an Page Number List, a sorted array of IDs.
*
* The first element of the array is a counter for how many actual page-numbers
* are in the list. By default PNLs are sorted in descending order, this allow
* cut off a page with lowest pgno (at the tail) just truncating the list. The
* sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *pnl_t;
typedef const pgno_t *const_pnl_t;
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_ORDERED(first, last) ((first) < (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
#else
#define MDBX_PNL_ORDERED(first, last) ((first) > (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
#endif
#define MDBX_PNL_GRANULATE_LOG2 10
#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2)
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0]))
#define MDBX_PNL_SETSIZE(pl, size) \
do { \
const size_t __size = size; \
assert(__size < INT_MAX); \
(pl)[0] = (pgno_t)__size; \
} while (0)
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1])
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_EDGE(pl) ((pl) + 1)
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
#else
#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl))
#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
  /* Bytes to allocate for a list of `size` entries: doubled when radix
   * sort needs scratch space, plus 3 slots (capacity, length, and one
   * extra — see the over-read note in pnl_search), rounded up to the
   * granule with the assumed malloc overhead kept outside the rounding. */
  assert(size > 0 && size <= PAGELIST_LIMIT);
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
  size += size;
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
  STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD +
                    (PAGELIST_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) +
                     MDBX_PNL_GRANULATE + 3) *
                        sizeof(pgno_t) <
                SIZE_MAX / 4 * 3);
  size_t bytes =
      ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 3),
                    MDBX_PNL_GRANULATE * sizeof(pgno_t)) -
      MDBX_ASSUME_MALLOC_OVERHEAD;
  return bytes;
}
MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
  /* Inverse of pnl_size2bytes(): how many entries fit into `bytes`,
   * after subtracting the 3 bookkeeping slots and halving when radix-sort
   * scratch space is reserved. */
  size_t size = bytes / sizeof(pgno_t);
  assert(size > 3 && size <= PAGELIST_LIMIT + /* alignment gap */ 65536);
  size -= 3;
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
  size >>= 1;
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
  return (pgno_t)size;
}
MDBX_INTERNAL pnl_t pnl_alloc(size_t size);
MDBX_INTERNAL void pnl_free(pnl_t pnl);
MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl,
const size_t wanna);
MDBX_MAYBE_UNUSED static inline int __must_check_result
pnl_need(pnl_t __restrict *__restrict ppnl, size_t num) {
  /* Ensure room for `num` additional entries, delegating to
   * pnl_reserve() only when the current capacity is insufficient. */
  assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl));
  assert(num <= PAGELIST_LIMIT);
  const size_t wanna = MDBX_PNL_GETSIZE(*ppnl) + num;
  return likely(MDBX_PNL_ALLOCLEN(*ppnl) >= wanna) ? MDBX_SUCCESS
                                                   : pnl_reserve(ppnl, wanna);
}
MDBX_MAYBE_UNUSED static inline void
pnl_append_prereserved(__restrict pnl_t pnl, pgno_t pgno) {
  /* Append one entry assuming capacity was already reserved; under
   * auditing also assert that `pgno` is not already present. */
  assert(MDBX_PNL_GETSIZE(pnl) < MDBX_PNL_ALLOCLEN(pnl));
  if (AUDIT_ENABLED()) {
    for (size_t i = MDBX_PNL_GETSIZE(pnl); i > 0; --i)
      assert(pgno != pnl[i]);
  }
  /* Bump the stored length, then write into the new last slot. */
  *pnl += 1;
  MDBX_PNL_LAST(pnl) = pgno;
}
MDBX_INTERNAL void pnl_shrink(pnl_t __restrict *__restrict ppnl);
MDBX_INTERNAL int __must_check_result spill_append_span(__restrict pnl_t *ppnl,
pgno_t pgno, size_t n);
MDBX_INTERNAL int __must_check_result pnl_append_span(__restrict pnl_t *ppnl,
pgno_t pgno, size_t n);
MDBX_INTERNAL int __must_check_result pnl_insert_span(__restrict pnl_t *ppnl,
pgno_t pgno, size_t n);
MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl);
MDBX_INTERNAL bool pnl_check(const const_pnl_t pnl, const size_t limit);
MDBX_MAYBE_UNUSED static inline bool pnl_check_allocated(const const_pnl_t pnl,
const size_t limit) {
return pnl == nullptr || (MDBX_PNL_ALLOCLEN(pnl) >= MDBX_PNL_GETSIZE(pnl) &&
pnl_check(pnl, limit));
}
MDBX_MAYBE_UNUSED static inline void pnl_sort(pnl_t pnl, size_t limit4check) {
  /* Sort, then (in assert-enabled builds) validate against limit4check. */
  pnl_sort_nochk(pnl);
  assert(pnl_check(pnl, limit4check));
  (void)limit4check;
}
MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno,
                                                  size_t limit) {
  /* Checked wrapper around pnl_search_nochk() with Valgrind annotations
   * for the cmov-accelerated binary search. */
  assert(pnl_check_allocated(pnl, limit));
  if (MDBX_HAVE_CMOV) {
    /* The cmov-accelerated binary search may read (but not use) one
     * element past the end of the data; that element lies within the
     * allocated region but is uninitialized. */
    VALGRIND_MAKE_MEM_DEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t));
  }
  assert(pgno < limit);
  (void)limit;
  size_t n = pnl_search_nochk(pnl, pgno);
  if (MDBX_HAVE_CMOV) {
    /* Restore the poisoning so later stray reads are still caught. */
    VALGRIND_MAKE_MEM_UNDEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t));
  }
  return n;
}
MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src);

View File

@ -1,19 +1,162 @@
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
/* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */
#if (defined(MDBX_DEBUG) && MDBX_DEBUG > 0) || \
(defined(MDBX_FORCE_ASSERTIONS) && MDBX_FORCE_ASSERTIONS)
#undef NDEBUG
#endif
/*----------------------------------------------------------------------------*/
/** Disables using GNU/Linux libc extensions.
* \ingroup build_option
* \note This option couldn't be moved to the options.h since dependent
* control macros/defined should be prepared before include the options.h */
#ifndef MDBX_DISABLE_GNU_SOURCE
#define MDBX_DISABLE_GNU_SOURCE 0
#endif
#if MDBX_DISABLE_GNU_SOURCE
#undef _GNU_SOURCE
#elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE)
#define _GNU_SOURCE
#endif /* MDBX_DISABLE_GNU_SOURCE */
/* Should be defined before any includes */
#if !defined(_FILE_OFFSET_BITS) && !defined(__ANDROID_API__) && \
!defined(ANDROID)
#define _FILE_OFFSET_BITS 64
#endif /* _FILE_OFFSET_BITS */
#if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE)
#define _DARWIN_C_SOURCE
#endif /* _DARWIN_C_SOURCE */
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
!defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS)
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 /* Windows 7 */
#endif /* _WIN32_WINNT */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif /* _CRT_SECURE_NO_WARNINGS */
#if !defined(UNICODE)
#define UNICODE
#endif /* UNICODE */
#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \
!defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
#define _NO_CRT_STDIO_INLINE
#endif /* _NO_CRT_STDIO_INLINE */
#elif !defined(_POSIX_C_SOURCE)
#define _POSIX_C_SOURCE 200809L
#endif /* Windows */
#ifdef __cplusplus
#ifndef NOMINMAX
#define NOMINMAX
#endif /* NOMINMAX */
/* Workaround for modern libstdc++ with CLANG < 4.x */
#if defined(__SIZEOF_INT128__) && !defined(__GLIBCXX_TYPE_INT_N_0) && \
defined(__clang__) && __clang_major__ < 4
#define __GLIBCXX_BITSIZE_INT_N_0 128
#define __GLIBCXX_TYPE_INT_N_0 __int128
#endif /* Workaround for modern libstdc++ with CLANG < 4.x */
#ifdef _MSC_VER
/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */
#if defined(__SANITIZE_ADDRESS__) && !defined(_DISABLE_VECTOR_ANNOTATION)
#define _DISABLE_VECTOR_ANNOTATION
#endif /* _DISABLE_VECTOR_ANNOTATION */
#endif /* _MSC_VER */
#endif /* __cplusplus */
#ifdef _MSC_VER
#if _MSC_FULL_VER < 190024234
/* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual
* Studio 2015 Update 3). But you could remove this #error and try to continue
* at your own risk. In such case please don't rise up an issues related ONLY to
* old compilers.
*
* NOTE:
* Unfortunately, there are several different builds of "Visual Studio" that
* are called "Visual Studio 2015 Update 3".
*
* The 190024234 is used here because it is minimal version of Visual Studio
* that was used for build and testing libmdbx in recent years. Soon this
* value will be increased to 19.0.24241.7, since build and testing using
* "Visual Studio 2015" will be performed only at https://ci.appveyor.com.
*
* Please ask Microsoft (but not us) for information about version differences
* and how to and where you can obtain the latest "Visual Studio 2015" build
* with all fixes.
*/
#error \
"At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required."
#endif
#if _MSC_VER > 1800
#pragma warning(disable : 4464) /* relative include path contains '..' */
#endif
#if _MSC_VER > 1913
#pragma warning(disable : 5045) /* will insert Spectre mitigation... */
#endif
#if _MSC_VER > 1914
#pragma warning( \
disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \
producing 'defined' has undefined behavior */
#endif
#if _MSC_VER < 1920
/* avoid "error C2219: syntax error: type qualifier must be after '*'" */
#define __restrict
#endif
#if _MSC_VER > 1930
#pragma warning(disable : 6235) /* <expression> is always a constant */
#pragma warning(disable : 6237) /* <expression> is never evaluated and might \
have side effects */
#endif
#pragma warning(disable : 4710) /* 'xyz': function not inlined */
#pragma warning(disable : 4711) /* function 'xyz' selected for automatic \
inline expansion */
#pragma warning(disable : 4201) /* nonstandard extension used: nameless \
struct/union */
#pragma warning(disable : 4702) /* unreachable code */
#pragma warning(disable : 4706) /* assignment within conditional expression */
#pragma warning(disable : 4127) /* conditional expression is constant */
#pragma warning(disable : 4324) /* 'xyz': structure was padded due to \
alignment specifier */
#pragma warning(disable : 4310) /* cast truncates constant value */
#pragma warning(disable : 4820) /* bytes padding added after data member for \
alignment */
#pragma warning(disable : 4548) /* expression before comma has no effect; \
expected expression with side - effect */
#pragma warning(disable : 4366) /* the result of the unary '&' operator may be \
unaligned */
#pragma warning(disable : 4200) /* nonstandard extension used: zero-sized \
array in struct/union */
#pragma warning(disable : 4204) /* nonstandard extension used: non-constant \
aggregate initializer */
#pragma warning( \
disable : 4505) /* unreferenced local function has been removed */
#endif /* _MSC_VER (warnings) */
#if defined(__GNUC__) && __GNUC__ < 9
#pragma GCC diagnostic ignored "-Wattributes"
#endif /* GCC < 9 */
#include "../mdbx.h"
/*----------------------------------------------------------------------------*/
/* Microsoft compiler generates a lot of warning for self includes... */
@ -28,20 +171,9 @@
* not guaranteed. Specify /EHsc */
#endif /* _MSC_VER (warnings) */
#if defined(_WIN32) || defined(_WIN64)
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif /* _CRT_SECURE_NO_WARNINGS */
#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \
!defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
#define _NO_CRT_STDIO_INLINE
#endif
#elif !defined(_POSIX_C_SOURCE)
#define _POSIX_C_SOURCE 200809L
#endif /* Windows */
/*----------------------------------------------------------------------------*/
/* basic C99 includes */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@ -55,21 +187,6 @@
#include <string.h>
#include <time.h>
#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
#error \
"Sanity checking failed: Two's complement, reasonably sized integer types"
#endif
#ifndef SSIZE_MAX
#define SSIZE_MAX INTPTR_MAX
#endif
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
/* feature testing */
@ -81,6 +198,14 @@
#define __has_include(x) (0)
#endif
#ifndef __has_attribute
#define __has_attribute(x) (0)
#endif
#ifndef __has_cpp_attribute
#define __has_cpp_attribute(x) 0
#endif
#ifndef __has_feature
#define __has_feature(x) (0)
#endif
@ -89,6 +214,10 @@
#define __has_extension(x) (0)
#endif
#ifndef __has_builtin
#define __has_builtin(x) (0)
#endif
#if __has_feature(thread_sanitizer)
#define __SANITIZE_THREAD__ 1
#endif
@ -124,6 +253,47 @@
#endif
#endif /* __GLIBC_PREREQ */
/*----------------------------------------------------------------------------*/
/* pre-requirements */
#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
#error \
"Sanity checking failed: Two's complement, reasonably sized integer types"
#endif
#ifndef SSIZE_MAX
#define SSIZE_MAX INTPTR_MAX
#endif
#if defined(__GNUC__) && !__GNUC_PREREQ(4, 2)
/* Actually libmdbx was not tested with compilers older than GCC 4.2.
* But you could ignore this warning at your own risk.
* In such case please don't rise up an issues related ONLY to old compilers.
*/
#warning "libmdbx required GCC >= 4.2"
#endif
#if defined(__clang__) && !__CLANG_PREREQ(3, 8)
/* Actually libmdbx was not tested with CLANG older than 3.8.
* But you could ignore this warning at your own risk.
* In such case please don't rise up an issues related ONLY to old compilers.
*/
#warning "libmdbx required CLANG >= 3.8"
#endif
#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12)
/* Actually libmdbx was not tested with something older than glibc 2.12.
* But you could ignore this warning at your own risk.
* In such case please don't rise up an issues related ONLY to old systems.
*/
#warning "libmdbx was only tested with GLIBC >= 2.12."
#endif
#ifdef __SANITIZE_THREAD__
#warning \
"libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues."
#endif /* __SANITIZE_THREAD__ */
/*----------------------------------------------------------------------------*/
/* C11' alignas() */
@ -240,12 +410,14 @@ __extern_C key_t ftok(const char *, int);
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
#include <excpt.h>
#include <tlhelp32.h>
#include <windows.h>
#include <winnt.h>
#include <winternl.h>
/* После подгрузки windows.h, чтобы избежать проблем со сборкой MINGW и т.п. */
#include <excpt.h>
#include <tlhelp32.h>
#else /*----------------------------------------------------------------------*/
#include <unistd.h>
@ -502,10 +674,11 @@ __extern_C key_t ftok(const char *, int);
#ifndef container_of
#define container_of(ptr, type, member) \
((type *)((char *)(ptr)-offsetof(type, member)))
((type *)((char *)(ptr) - offsetof(type, member)))
#endif /* container_of */
/*----------------------------------------------------------------------------*/
/* useful attributes */
#ifndef __always_inline
#if defined(__GNUC__) || __has_attribute(__always_inline__)
@ -513,7 +686,7 @@ __extern_C key_t ftok(const char *, int);
#elif defined(_MSC_VER)
#define __always_inline __forceinline
#else
#define __always_inline
#define __always_inline __inline
#endif
#endif /* __always_inline */
@ -639,16 +812,6 @@ __extern_C key_t ftok(const char *, int);
#endif
#endif /* __anonymous_struct_extension__ */
#ifndef expect_with_probability
#if defined(__builtin_expect_with_probability) || \
__has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0)
#define expect_with_probability(expr, value, prob) \
__builtin_expect_with_probability(expr, value, prob)
#else
#define expect_with_probability(expr, value, prob) (expr)
#endif
#endif /* expect_with_probability */
#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE
#ifdef WEAK_IMPORT_ATTRIBUTE
#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE
@ -662,6 +825,32 @@ __extern_C key_t ftok(const char *, int);
#endif
#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */
#if !defined(__thread) && (defined(_MSC_VER) || defined(__DMC__))
#define __thread __declspec(thread)
#endif /* __thread */
#ifndef MDBX_EXCLUDE_FOR_GPROF
#ifdef ENABLE_GPROF
#define MDBX_EXCLUDE_FOR_GPROF \
__attribute__((__no_instrument_function__, \
__no_profile_instrument_function__))
#else
#define MDBX_EXCLUDE_FOR_GPROF
#endif /* ENABLE_GPROF */
#endif /* MDBX_EXCLUDE_FOR_GPROF */
/*----------------------------------------------------------------------------*/
#ifndef expect_with_probability
#if defined(__builtin_expect_with_probability) || \
__has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0)
#define expect_with_probability(expr, value, prob) \
__builtin_expect_with_probability(expr, value, prob)
#else
#define expect_with_probability(expr, value, prob) (expr)
#endif
#endif /* expect_with_probability */
#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER
#ifdef _PREFAST_
#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1
@ -684,7 +873,17 @@ __extern_C key_t ftok(const char *, int);
#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id)
#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */
#ifndef FLEXIBLE_ARRAY_MEMBERS
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
#define FLEXIBLE_ARRAY_MEMBERS 1
#else
#define FLEXIBLE_ARRAY_MEMBERS 0
#endif
#endif /* FLEXIBLE_ARRAY_MEMBERS */
/*----------------------------------------------------------------------------*/
/* Valgrind and Address Sanitizer */
#if defined(ENABLE_MEMCHECK)
#include <valgrind/memcheck.h>
@ -766,10 +965,69 @@ template <typename T, size_t N> char (&__ArraySizeHelper(T (&array)[N]))[N];
#define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
#endif
#ifndef __Wpedantic_format_voidptr
MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void *
__Wpedantic_format_voidptr(const void *ptr) {
  /* Identity helper: launders an object pointer to `const void *` so that
   * printf-style "%p" arguments do not trigger -Wpedantic format
   * warnings; the self-referential macro below routes uses through it. */
  return ptr;
}
#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
#endif /* __Wpedantic_format_voidptr */
/*----------------------------------------------------------------------------*/
#if defined(_MSC_VER) && _MSC_VER >= 1900
/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
* for internal format-args checker. */
#undef PRIuPTR
#undef PRIiPTR
#undef PRIdPTR
#undef PRIxPTR
#define PRIuPTR "Iu"
#define PRIiPTR "Ii"
#define PRIdPTR "Id"
#define PRIxPTR "Ix"
#define PRIuSIZE "zu"
#define PRIiSIZE "zi"
#define PRIdSIZE "zd"
#define PRIxSIZE "zx"
#endif /* fix PRI*PTR for _MSC_VER */
#ifndef PRIuSIZE
#define PRIuSIZE PRIuPTR
#define PRIiSIZE PRIiPTR
#define PRIdSIZE PRIdPTR
#define PRIxSIZE PRIxPTR
#endif /* PRI*SIZE macros for MSVC */
#ifdef _MSC_VER
#pragma warning(pop)
#endif
/*----------------------------------------------------------------------------*/
#if __has_warning("-Wnested-anon-types")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wnested-anon-types"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wnested-anon-types"
#else
#pragma warning disable "nested-anon-types"
#endif
#endif /* -Wnested-anon-types */
#if __has_warning("-Wconstant-logical-operand")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wconstant-logical-operand"
#else
#pragma warning disable "constant-logical-operand"
#endif
#endif /* -Wconstant-logical-operand */
#if defined(__LCC__) && (__LCC__ <= 121)
/* bug #2798 */
#pragma diag_suppress alignment_reduction_ignored
#elif defined(__ICC)
#pragma warning(disable : 3453 1366)
#elif __has_warning("-Walignment-reduction-ignored")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Walignment-reduction-ignored"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
#else
#pragma warning disable "alignment-reduction-ignored"
#endif
#endif /* -Walignment-reduction-ignored */

119
src/proto.h Normal file
View File

@ -0,0 +1,119 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#pragma once

#include "essentials.h"

/* Internal prototypes shared between the engine's translation units.
 * Everything here is private to libmdbx (MDBX_INTERNAL linkage). */

/* audit.c */

MDBX_INTERNAL int audit_ex(MDBX_txn *txn, size_t retired_stored,
                           bool dont_filter_gc);

/* mvcc-readers.c */

MDBX_INTERNAL bsr_t mvcc_bind_slot(MDBX_env *env, const uintptr_t tid);
MDBX_MAYBE_UNUSED MDBX_INTERNAL pgno_t mvcc_largest_this(MDBX_env *env,
                                                         pgno_t largest);
/* Note: "shapshot" (sic) is kept as-is — the identifier's spelling must
 * match its definition and call sites. */
MDBX_INTERNAL txnid_t mvcc_shapshot_oldest(MDBX_env *const env,
                                           const txnid_t steady);
MDBX_INTERNAL pgno_t mvcc_snapshot_largest(const MDBX_env *env,
                                           pgno_t last_used_page);
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env,
                                         const txnid_t straggler);
MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rlocked, int *dead);
/* A redundant duplicate prototype of mvcc_kick_laggards() (identical except
 * for the parameter name `laggard`) was removed here. */

/* dxb.c */

MDBX_INTERNAL int dxb_setup(MDBX_env *env, const int lck_rc,
                            const mdbx_mode_t mode_bits);
MDBX_INTERNAL int __must_check_result
dxb_read_header(MDBX_env *env, meta_t *meta, const int lck_exclusive,
                const mdbx_mode_t mode_bits);
/* Note: "impilict_shrink" (sic) is kept as-is — the enumerator is referenced
 * from other translation units. */
enum resize_mode { implicit_grow, impilict_shrink, explicit_resize };
MDBX_INTERNAL int __must_check_result dxb_resize(MDBX_env *const env,
                                                 const pgno_t used_pgno,
                                                 const pgno_t size_pgno,
                                                 pgno_t limit_pgno,
                                                 const enum resize_mode mode);
MDBX_INTERNAL int dxb_set_readahead(const MDBX_env *env, const pgno_t edge,
                                    const bool enable, const bool force_whole);
MDBX_INTERNAL int __must_check_result dxb_sync_locked(MDBX_env *env,
                                                      unsigned flags,
                                                      meta_t *const pending,
                                                      troika_t *const troika);
#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)
MDBX_INTERNAL void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn);
#else
/* No-op stub when neither Valgrind nor ASAN instrumentation is enabled. */
static inline void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) {
  (void)env;
  (void)txn;
}
#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */

/* txn.c */

MDBX_INTERNAL bool txn_refund(MDBX_txn *txn);
MDBX_INTERNAL txnid_t txn_snapshot_oldest(const MDBX_txn *const txn);
MDBX_INTERNAL int txn_abort(MDBX_txn *txn);
MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags);

/* Human-readable names for the TXN_END_* operation codes below, in the same
 * order (for logging). */
#define TXN_END_NAMES                                                          \
  {"committed", "empty-commit", "abort", "reset",                              \
   "reset-tmp", "fail-begin", "fail-beginchild"}
enum {
  /* txn_end operation number, for logging */
  TXN_END_COMMITTED,
  TXN_END_PURE_COMMIT,
  TXN_END_ABORT,
  TXN_END_RESET,
  TXN_END_RESET_TMP,
  TXN_END_FAIL_BEGIN,
  TXN_END_FAIL_BEGINCHILD,
  TXN_END_OPMASK = 0x0F /* mask for txn_end() operation number */,
  TXN_END_UPDATE = 0x10 /* update env state (DBIs) */,
  TXN_END_FREE = 0x20 /* free txn unless it is env.basal_txn */,
  TXN_END_EOTDONE = 0x40 /* txn's cursors already closed */,
  TXN_END_SLOT = 0x80 /* release any reader slot if NOSTICKYTHREADS */
};
MDBX_INTERNAL int txn_end(MDBX_txn *txn, const unsigned mode);
MDBX_INTERNAL int txn_write(MDBX_txn *txn, iov_ctx_t *ctx);

/* env.c */

MDBX_INTERNAL int env_open(MDBX_env *env, mdbx_mode_t mode);
MDBX_INTERNAL int env_info(const MDBX_env *env, const MDBX_txn *txn,
                           MDBX_envinfo *out, size_t bytes, troika_t *troika);
MDBX_INTERNAL int env_sync(MDBX_env *env, bool force, bool nonblock);
MDBX_INTERNAL int env_close(MDBX_env *env, bool resurrect_after_fork);
MDBX_INTERNAL bool env_txn0_owned(const MDBX_env *env);
MDBX_INTERNAL void env_options_init(MDBX_env *env);
MDBX_INTERNAL void env_options_adjust_defaults(MDBX_env *env);
MDBX_INTERNAL int __must_check_result env_page_auxbuffer(MDBX_env *env);
MDBX_INTERNAL unsigned env_setup_pagesize(MDBX_env *env, const size_t pagesize);

/* tree.c */

MDBX_INTERNAL int tree_drop(MDBX_cursor *mc, const bool may_have_subDBs);
MDBX_INTERNAL int __must_check_result tree_rebalance(MDBX_cursor *mc);
MDBX_INTERNAL int __must_check_result tree_propagate_key(MDBX_cursor *mc,
                                                         const MDBX_val *key);
MDBX_INTERNAL void recalculate_merge_thresholds(MDBX_env *env);

/* subdb.c */

MDBX_INTERNAL int __must_check_result sdb_fetch(MDBX_txn *txn, size_t dbi);
MDBX_INTERNAL int __must_check_result sdb_setup(const MDBX_env *env,
                                                kvx_t *const kvx,
                                                const tree_t *const db);

/* coherency.c */

MDBX_INTERNAL bool coherency_check_meta(const MDBX_env *env,
                                        const volatile meta_t *meta,
                                        bool report);
MDBX_INTERNAL int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head,
                                       uint64_t *timestamp);
MDBX_INTERNAL int coherency_check_written(const MDBX_env *env,
                                          const txnid_t txnid,
                                          const volatile meta_t *meta,
                                          const intptr_t pgno,
                                          uint64_t *timestamp);
MDBX_INTERNAL int coherency_timeout(uint64_t *timestamp, intptr_t pgno,
                                    const MDBX_env *env);

394
src/range-estimate.c Normal file
View File

@ -0,0 +1,394 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Result of cursor_diff(): a signed positional difference between two
 * cursors within the same tree. */
typedef struct diff_result {
  ptrdiff_t diff;       /* signed distance between the cursor positions */
  intptr_t level;       /* tree level at which the cursor stacks diverge */
  ptrdiff_t root_nkeys; /* number of keys on the root page */
} diff_t;
/* Calculates r = x - y: the signed positional difference between two live
 * cursors of the same transaction/table, plus the level where their stacks
 * diverge and the root page's key count. Returns MDBX_SUCCESS or an error. */
__hot static int cursor_diff(const MDBX_cursor *const __restrict x,
                             const MDBX_cursor *const __restrict y,
                             diff_t *const __restrict r) {
  r->diff = 0;
  r->level = 0;
  r->root_nkeys = 0;
  /* Both cursors must carry the live-cursor signature. */
  if (unlikely(x->signature != cur_signature_live))
    return (x->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
                                                         : MDBX_EBADSIGN;
  if (unlikely(y->signature != cur_signature_live))
    return (y->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
                                                         : MDBX_EBADSIGN;
  int rc = check_txn(x->txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* Cursors of different transactions or tables are not comparable. */
  if (unlikely(x->txn != y->txn))
    return MDBX_BAD_TXN;
  if (unlikely(y->dbi_state != x->dbi_state))
    return MDBX_EINVAL;
  const intptr_t depth = (x->top < y->top) ? x->top : y->top;
  if (unlikely(depth < 0))
    return MDBX_ENODATA;
  r->root_nkeys = page_numkeys(x->pg[0]);
  intptr_t nkeys = r->root_nkeys;
  /* Walk both page stacks in lock-step until the key indices differ. */
  for (;;) {
    if (unlikely(y->pg[r->level] != x->pg[r->level])) {
      ERROR("Mismatch cursors's pages at %zu level", r->level);
      return MDBX_PROBLEM;
    }
    r->diff = x->ki[r->level] - y->ki[r->level];
    if (r->diff)
      break;
    r->level += 1;
    if (r->level > depth) {
      /* Identical positions along the whole shared depth: only the
       * hard-EOF flags can distinguish the cursors. */
      r->diff = CMP2INT(x->flags & z_eof_hard, y->flags & z_eof_hard);
      return MDBX_SUCCESS;
    }
    nkeys = page_numkeys(x->pg[r->level]);
  }
  /* Refine a +/-1 divergence by descending into the adjacent subtrees. */
  while (unlikely(r->diff == 1) && likely(r->level < depth)) {
    r->level += 1;
    /* DB'PAGEs: 0------------------>MAX
     *
     * CURSORs:       y < x
     * STACK[i ]:         |
     * STACK[+1]:  ...y++N|0++x...
     */
    nkeys = page_numkeys(y->pg[r->level]);
    r->diff = (nkeys - y->ki[r->level]) + x->ki[r->level];
    assert(r->diff > 0);
  }
  while (unlikely(r->diff == -1) && likely(r->level < depth)) {
    r->level += 1;
    /* DB'PAGEs: 0------------------>MAX
     *
     * CURSORs:       x < y
     * STACK[i ]:         |
     * STACK[+1]:  ...x--N|0--y...
     */
    nkeys = page_numkeys(x->pg[r->level]);
    r->diff = -(nkeys - x->ki[r->level]) - y->ki[r->level];
    assert(r->diff < 0);
  }
  return MDBX_SUCCESS;
}
/* Converts the raw cursor difference `dr` into an estimated item count by
 * scaling with the tree's average page-fill factor for each level below the
 * divergence point, using integer fixed-point arithmetic. The result is
 * clamped to +/- tree->items. */
__hot static ptrdiff_t estimate(const tree_t *tree,
                                diff_t *const __restrict dr) {
  /*        root: branch-page    => scale = leaf-factor * branch-factor^(N-1)
   *     level-1: branch-page(s) => scale = leaf-factor * branch-factor^2
   *     level-2: branch-page(s) => scale = leaf-factor * branch-factor
   *     level-N: branch-page(s) => scale = leaf-factor
   *  leaf-level: leaf-page(s)   => scale = 1
   */
  ptrdiff_t btree_power = (ptrdiff_t)tree->height - 2 - (ptrdiff_t)dr->level;
  if (btree_power < 0)
    /* Diverged at (or below) the leaf level: the diff is exact. */
    return dr->diff;
  ptrdiff_t estimated =
      (ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)tree->leaf_pages;
  if (btree_power == 0)
    return estimated;
  if (tree->height < 4) {
    assert(dr->level == 0 && btree_power == 1);
    return (ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)dr->root_nkeys;
  }
  /* average_branchpage_fillfactor = total(branch_entries) / branch_pages
    total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */
  /* NOTE(review): `sizeof(size_t) - 1` gives a shift of 7 bits on 64-bit
   * targets (i.e. a small fixed-point scale, keeping products in range) —
   * confirm this is intentional before changing. */
  const size_t log2_fixedpoint = sizeof(size_t) - 1;
  const size_t half = UINT64_C(1) << (log2_fixedpoint - 1);
  const size_t factor =
      ((tree->leaf_pages + tree->branch_pages - 1) << log2_fixedpoint) /
      tree->branch_pages;
  while (1) {
    switch ((size_t)btree_power) {
    default: {
      /* Fold four factor-multiplications at once via squaring. */
      const size_t square = (factor * factor + half) >> log2_fixedpoint;
      const size_t quad = (square * square + half) >> log2_fixedpoint;
      do {
        estimated = estimated * quad + half;
        estimated >>= log2_fixedpoint;
        btree_power -= 4;
      } while (btree_power >= 4);
      continue;
    }
    case 3:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 2:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 1:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 0:
      /* Clamp the final estimate to the tree's total item count. */
      if (unlikely(estimated > (ptrdiff_t)tree->items))
        return (ptrdiff_t)tree->items;
      if (unlikely(estimated < -(ptrdiff_t)tree->items))
        return -(ptrdiff_t)tree->items;
      return estimated;
    }
  }
}
/* Public API: estimates the signed number of items between the positions of
 * two cursors (last - first). When the outer positions coincide on a dupsort
 * table, the distance is measured inside the nested duplicates tree instead.
 * Returns MDBX_SUCCESS and writes the estimate, or an error code. */
__hot int mdbx_estimate_distance(const MDBX_cursor *first,
                                 const MDBX_cursor *last,
                                 ptrdiff_t *distance_items) {
  /* Reject null arguments up front. */
  if (unlikely(first == nullptr) || unlikely(last == nullptr) ||
      unlikely(distance_items == nullptr))
    return MDBX_EINVAL;
  *distance_items = 0;

  diff_t delta;
  int err = cursor_diff(last, first, &delta);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  cASSERT(first, delta.diff || inner_pointed(first) == inner_pointed(last));
  if (unlikely(delta.diff == 0) && inner_pointed(first)) {
    /* Outer cursors coincide: re-measure within the nested dupsort tree. */
    first = &first->subcur->cursor;
    last = &last->subcur->cursor;
    err = cursor_diff(first, last, &delta);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }

  if (likely(delta.diff != 0))
    *distance_items = estimate(first->tree, &delta);
  return MDBX_SUCCESS;
}
/* Public API: estimates how far (in items) `move_op` would move the cursor,
 * without disturbing it. A throw-away copy of the cursor is positioned by
 * `move_op` and then compared with the original via mdbx_estimate_distance(). */
__hot int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key,
                             MDBX_val *data, MDBX_cursor_op move_op,
                             ptrdiff_t *distance_items) {
  if (unlikely(cursor == nullptr || distance_items == nullptr ||
               move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE))
    return MDBX_EINVAL;
  if (unlikely(cursor->signature != cur_signature_live))
    return (cursor->signature == cur_signature_ready4dispose) ? MDBX_EINVAL
                                                              : MDBX_EBADSIGN;
  int rc = check_txn(cursor->txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!is_pointed(cursor)))
    return MDBX_ENODATA;
  /* Clone the cursor (and its nested dupsort cursor, if any) so the probe
   * move does not affect the caller's cursor. */
  cursor_couple_t next;
  rc = cursor_init(&next.outer, cursor->txn, cursor_dbi(cursor));
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  cursor_cpstk(cursor, &next.outer);
  if (cursor->tree->flags & MDBX_DUPSORT) {
    subcur_t *mx = &container_of(cursor, cursor_couple_t, outer)->inner;
    cursor_cpstk(&mx->cursor, &next.inner.cursor);
  }
  /* Operations in `mask` require a caller-supplied data argument; all
   * others may run with an empty stub. */
  MDBX_val stub_data;
  if (data == nullptr) {
    const unsigned mask =
        1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY;
    if (unlikely(mask & (1 << move_op)))
      return MDBX_EINVAL;
    stub_data.iov_base = nullptr;
    stub_data.iov_len = 0;
    data = &stub_data;
  }
  /* Likewise for the key argument. */
  MDBX_val stub_key;
  if (key == nullptr) {
    const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE |
                          1 << MDBX_SET_KEY | 1 << MDBX_SET |
                          1 << MDBX_SET_RANGE;
    if (unlikely(mask & (1 << move_op)))
      return MDBX_EINVAL;
    stub_key.iov_base = nullptr;
    stub_key.iov_len = 0;
    key = &stub_key;
  }
  next.outer.signature = cur_signature_live;
  rc = cursor_ops(&next.outer, key, data, move_op);
  if (unlikely(rc != MDBX_SUCCESS &&
               (rc != MDBX_NOTFOUND || !is_pointed(&next.outer))))
    return rc;
  if (move_op == MDBX_LAST) {
    /* Mark the probe as hard-EOF so the distance accounts for the
     * end-of-table position. */
    next.outer.flags |= z_eof_hard;
    next.inner.cursor.flags |= z_eof_hard;
  }
  return mdbx_estimate_distance(cursor, &next.outer, distance_items);
}
/* Public API: estimates the number of items between the given begin/end
 * bounds of a table. MDBX_EPSILON as a key argument denotes a zero-width
 * bound adjacent to the other key. The estimate may be negative for an
 * inverted range (begin positioned after end). */
__hot int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi,
                              const MDBX_val *begin_key,
                              const MDBX_val *begin_data,
                              const MDBX_val *end_key, const MDBX_val *end_data,
                              ptrdiff_t *size_items) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(!size_items))
    return MDBX_EINVAL;
  /* A data bound is only meaningful with a proper (non-epsilon) key bound. */
  if (unlikely(begin_data &&
               (begin_key == nullptr || begin_key == MDBX_EPSILON)))
    return MDBX_EINVAL;
  if (unlikely(end_data && (end_key == nullptr || end_key == MDBX_EPSILON)))
    return MDBX_EINVAL;
  if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON))
    return MDBX_EINVAL;
  cursor_couple_t begin;
  /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */
  rc = cursor_init(&begin.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(begin.outer.tree->items == 0)) {
    /* Empty table: the estimate is exactly zero. */
    *size_items = 0;
    return MDBX_SUCCESS;
  }
  if (!begin_key) {
    if (unlikely(!end_key)) {
      /* LY: FIRST..LAST case */
      *size_items = (ptrdiff_t)begin.outer.tree->items;
      return MDBX_SUCCESS;
    }
    rc = outer_first(&begin.outer, nullptr, nullptr);
    if (unlikely(end_key == MDBX_EPSILON)) {
      /* LY: FIRST..+epsilon case */
      return (rc == MDBX_SUCCESS)
                 ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
                 : rc;
    }
  } else {
    if (unlikely(begin_key == MDBX_EPSILON)) {
      if (end_key == nullptr) {
        /* LY: -epsilon..LAST case */
        rc = outer_last(&begin.outer, nullptr, nullptr);
        return (rc == MDBX_SUCCESS)
                   ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
                   : rc;
      }
      /* LY: -epsilon..value case */
      assert(end_key != MDBX_EPSILON);
      begin_key = end_key;
    } else if (unlikely(end_key == MDBX_EPSILON)) {
      /* LY: value..+epsilon case */
      assert(begin_key != MDBX_EPSILON);
      end_key = begin_key;
    }
    if (end_key && !begin_data && !end_data &&
        (begin_key == end_key ||
         begin.outer.clc->k.cmp(begin_key, end_key) == 0)) {
      /* LY: single key case */
      rc = cursor_seek(&begin.outer, (MDBX_val *)begin_key, nullptr, MDBX_SET)
               .err;
      if (unlikely(rc != MDBX_SUCCESS)) {
        *size_items = 0;
        return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
      }
      *size_items = 1;
      /* For dupsort tables report the number of duplicates under the key,
       * clamped to PTRDIFF_MAX. */
      if (inner_pointed(&begin.outer))
        *size_items =
            (sizeof(*size_items) >= sizeof(begin.inner.nested_tree.items) ||
             begin.inner.nested_tree.items <= PTRDIFF_MAX)
                ? (size_t)begin.inner.nested_tree.items
                : PTRDIFF_MAX;
      return MDBX_SUCCESS;
    } else {
      MDBX_val proxy_key = *begin_key;
      MDBX_val proxy_data = {nullptr, 0};
      if (begin_data)
        proxy_data = *begin_data;
      rc = cursor_seek(&begin.outer, &proxy_key, &proxy_data,
                       MDBX_SET_LOWERBOUND)
               .err;
    }
  }
  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !is_pointed(&begin.outer))
      return rc;
  }
  cursor_couple_t end;
  rc = cursor_init(&end.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (!end_key) {
    /* Open-ended range: position at the last item and mark hard-EOF. */
    rc = outer_last(&end.outer, nullptr, nullptr);
    end.outer.flags |= z_eof_hard;
    end.inner.cursor.flags |= z_eof_hard;
  } else {
    MDBX_val proxy_key = *end_key;
    MDBX_val proxy_data = {nullptr, 0};
    if (end_data)
      proxy_data = *end_data;
    rc = cursor_seek(&end.outer, &proxy_key, &proxy_data, MDBX_SET_LOWERBOUND)
             .err;
  }
  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !is_pointed(&end.outer))
      return rc;
  }
  rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  assert(*size_items >= -(ptrdiff_t)begin.outer.tree->items &&
         *size_items <= (ptrdiff_t)begin.outer.tree->items);
#if 0 /* LY: Was decided to returns as-is (i.e. negative) the estimation \
       * results for an inverted ranges. */
  /* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63
     Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */
  if (*size_items < 0) {
    /* LY: inverted range case */
    *size_items += (ptrdiff_t)begin.outer.tree->items;
  } else if (*size_items == 0 && begin_key && end_key) {
    int cmp = begin.outer.kvx->cmp(&origin_begin_key, &origin_end_key);
    if (cmp == 0 && cursor_pointed(begin.inner.cursor.flags) &&
        begin_data && end_data)
      cmp = begin.outer.kvx->v.cmp(&origin_begin_data, &origin_end_data);
    if (cmp > 0) {
      /* LY: inverted range case with empty scope */
      *size_items = (ptrdiff_t)begin.outer.tree->items;
    }
  }
  assert(*size_items >= 0 &&
         *size_items <= (ptrdiff_t)begin.outer.tree->items);
#endif
  return MDBX_SUCCESS;
}

229
src/refund.c Normal file
View File

@ -0,0 +1,229 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
#if MDBX_ENABLE_REFUND
/* Refunds the tail of the reclaimed-pages list (tw.relist) when it exactly
 * adjoins geo.first_unallocated, shrinking the used extent of the database
 * within this transaction. Precondition (asserted): the list is non-empty
 * and its highest entry equals first_unallocated - 1. */
static void refund_reclaimed(MDBX_txn *txn) {
  /* Scanning in descend order */
  pgno_t first_unallocated = txn->geo.first_unallocated;
  const pnl_t pnl = txn->tw.relist;
  tASSERT(txn,
          MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == first_unallocated - 1);
#if MDBX_PNL_ASCENDING
  /* Consume consecutive page numbers from the list's tail. */
  size_t i = MDBX_PNL_GETSIZE(pnl);
  tASSERT(txn, pnl[i] == first_unallocated - 1);
  while (--first_unallocated, --i > 0 && pnl[i] == first_unallocated - 1)
    ;
  MDBX_PNL_SETSIZE(pnl, i);
#else
  /* Descending layout: consume from the front, then shift the remainder. */
  size_t i = 1;
  tASSERT(txn, pnl[i] == first_unallocated - 1);
  size_t len = MDBX_PNL_GETSIZE(pnl);
  while (--first_unallocated, ++i <= len && pnl[i] == first_unallocated - 1)
    ;
  MDBX_PNL_SETSIZE(pnl, len -= i - 1);
  for (size_t move = 0; move < len; ++move)
    pnl[1 + move] = pnl[i + move];
#endif
  VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO,
          txn->geo.first_unallocated - first_unallocated,
          txn->geo.first_unallocated, first_unallocated);
  txn->geo.first_unallocated = first_unallocated;
  tASSERT(txn,
          pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - 1));
}
/* Refunds loose pages (pages freed within this txn) that adjoin the end of
 * the used space. Depending on the dirty-list state it either collects
 * candidate page numbers into a scratch list, or trims the tail of the
 * already-sorted dirty list. May lower geo.first_unallocated; a failed
 * scratch allocation is silently tolerated (refund is an optimization). */
static void refund_loose(MDBX_txn *txn) {
  tASSERT(txn, txn->tw.loose_pages != nullptr);
  tASSERT(txn, txn->tw.loose_count > 0);
  dpl_t *const dl = txn->tw.dirtylist;
  if (dl) {
    tASSERT(txn, dl->length >= txn->tw.loose_count);
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  } else {
    tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
  }
  /* Small on-stack scratch buffer; spill to the heap when the loose chain
   * is longer than it can hold. */
  pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
  pnl_t suitable = onstack;
  if (!dl || dl->length - dl->sorted > txn->tw.loose_count) {
    /* Dirty list is useless since unsorted. */
    if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) {
      suitable = pnl_alloc(txn->tw.loose_count);
      if (unlikely(!suitable))
        return /* this is not a reason for transaction fail */;
    }
    /* Collect loose-pages which may be refunded. */
    tASSERT(txn,
            txn->geo.first_unallocated >= MIN_PAGENO + txn->tw.loose_count);
    pgno_t most = MIN_PAGENO;
    size_t w = 0;
    for (const page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) {
      tASSERT(txn, lp->flags == P_LOOSE);
      tASSERT(txn, txn->geo.first_unallocated > lp->pgno);
      if (likely(txn->geo.first_unallocated - txn->tw.loose_count <=
                 lp->pgno)) {
        tASSERT(txn,
                w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack))
                                           : MDBX_PNL_ALLOCLEN(suitable)));
        suitable[++w] = lp->pgno;
        most = (lp->pgno > most) ? lp->pgno : most;
      }
      MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
      VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
    }
    if (most + 1 == txn->geo.first_unallocated) {
      /* Sort suitable list and refund pages at the tail. */
      MDBX_PNL_SETSIZE(suitable, w);
      pnl_sort(suitable, MAX_PAGENO + 1);
      /* Scanning in descend order */
      const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1;
      const intptr_t begin =
          MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(suitable) : 1;
      const intptr_t end =
          MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_GETSIZE(suitable) + 1;
      tASSERT(txn, suitable[begin] >= suitable[end - step]);
      tASSERT(txn, most == suitable[begin]);
      /* Extend the refunded run downwards while page numbers stay
       * contiguous. */
      for (intptr_t i = begin + step; i != end; i += step) {
        if (suitable[i] != most - 1)
          break;
        most -= 1;
      }
      const size_t refunded = txn->geo.first_unallocated - most;
      DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded,
            most, txn->geo.first_unallocated);
      txn->geo.first_unallocated = most;
      txn->tw.loose_count -= refunded;
      if (dl) {
        txn->tw.dirtyroom += refunded;
        dl->pages_including_loose -= refunded;
        assert(txn->tw.dirtyroom <= txn->env->options.dp_limit);
        /* Filter-out dirty list */
        size_t r = 0;
        w = 0;
        if (dl->sorted) {
          do {
            if (dl->items[++r].pgno < most) {
              if (++w != r)
                dl->items[w] = dl->items[r];
            }
          } while (r < dl->sorted);
          dl->sorted = w;
        }
        while (r < dl->length) {
          if (dl->items[++r].pgno < most) {
            if (++w != r)
              dl->items[w] = dl->items[r];
          }
        }
        dpl_setlen(dl, w);
        tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                         (txn->parent ? txn->parent->tw.dirtyroom
                                      : txn->env->options.dp_limit));
      }
      goto unlink_loose;
    }
  } else {
    /* Dirtylist is mostly sorted, just refund loose pages at the end. */
    dpl_sort(txn);
    tASSERT(txn,
            dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno);
    tASSERT(txn, dl->sorted == dl->length);
    /* Scan dirtylist tail-forward and cutoff suitable pages. */
    size_t n;
    for (n = dl->length; dl->items[n].pgno == txn->geo.first_unallocated - 1 &&
                         dl->items[n].ptr->flags == P_LOOSE;
         --n) {
      tASSERT(txn, n > 0);
      page_t *dp = dl->items[n].ptr;
      DEBUG("refund-sorted page %" PRIaPGNO, dp->pgno);
      tASSERT(txn, dp->pgno == dl->items[n].pgno);
      txn->geo.first_unallocated -= 1;
    }
    dpl_setlen(dl, n);
    if (dl->sorted != dl->length) {
      const size_t refunded = dl->sorted - dl->length;
      dl->sorted = dl->length;
      txn->tw.loose_count -= refunded;
      txn->tw.dirtyroom += refunded;
      dl->pages_including_loose -= refunded;
      tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                       (txn->parent ? txn->parent->tw.dirtyroom
                                    : txn->env->options.dp_limit));
      /* Filter-out loose chain & dispose refunded pages. */
    unlink_loose:
      for (page_t *__restrict *__restrict link = &txn->tw.loose_pages; *link;) {
        page_t *dp = *link;
        tASSERT(txn, dp->flags == P_LOOSE);
        MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *));
        VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *));
        if (txn->geo.first_unallocated > dp->pgno) {
          link = &page_next(dp);
        } else {
          /* The page is now beyond the used extent: unlink it and release
           * its shadow copy unless in writemap mode. */
          *link = page_next(dp);
          if ((txn->flags & MDBX_WRITEMAP) == 0)
            page_shadow_release(txn->env, dp, 1);
        }
      }
    }
  }
  tASSERT(txn, dpl_check(txn));
  if (suitable != onstack)
    pnl_free(suitable);
  txn->tw.loose_refund_wl = txn->geo.first_unallocated;
}
/* Repeatedly refunds loose and reclaimed pages adjoining the end of the
 * used space until no further shrink is possible. Returns true if
 * geo.first_unallocated was reduced. */
bool txn_refund(MDBX_txn *txn) {
  const pgno_t before = txn->geo.first_unallocated;
  if (txn->tw.loose_pages &&
      txn->tw.loose_refund_wl > txn->geo.first_unallocated)
    refund_loose(txn);
  /* Alternate between the two refund kinds: each may expose further
   * refundable pages for the other. */
  while (true) {
    if (MDBX_PNL_GETSIZE(txn->tw.relist) == 0 ||
        MDBX_PNL_MOST(txn->tw.relist) != txn->geo.first_unallocated - 1)
      break;
    refund_reclaimed(txn);
    if (!txn->tw.loose_pages ||
        txn->tw.loose_refund_wl <= txn->geo.first_unallocated)
      break;
    const pgno_t memo = txn->geo.first_unallocated;
    refund_loose(txn);
    if (memo == txn->geo.first_unallocated)
      break;
  }
  if (before == txn->geo.first_unallocated)
    return false;
  if (txn->tw.spilled.list)
    /* Squash deleted pagenums if we refunded any */
    spill_purge(txn);
  return true;
}
#else /* MDBX_ENABLE_REFUND */
/* Stub when refund support is compiled out: never shrinks anything. */
bool txn_refund(MDBX_txn *txn) {
  (void)txn;
  /* No online auto-compactification. */
  return false;
}
#endif /* MDBX_ENABLE_REFUND */

485
src/sort.h Normal file
View File

@ -0,0 +1,485 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// \file sort.h
/// \brief Macros implementing sorting and binary search
#pragma once
/* Element-count threshold at which callers presumably switch from the
 * quicksort below to the radix sort — confirm at the use sites. */
#define MDBX_RADIXSORT_THRESHOLD 142
/* ---------------------------------------------------------------------------
 * LY: State of the art quicksort-based sorting, with internal stack
 * and network-sort for small chunks.
 * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */
/* Conditional swap: afterwards (a) precedes-or-equals (b) w.r.t. CMP.
 * The CMOV variant is branchless (both outcomes computed, selected by the
 * comparison result); the fallback performs an ordinary guarded swap. */
#if MDBX_HAVE_CMOV
#define SORT_CMP_SWAP(TYPE, CMP, a, b)                                         \
  do {                                                                         \
    const TYPE swap_tmp = (a);                                                 \
    const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5);    \
    (a) = swap_cmp ? swap_tmp : b;                                             \
    (b) = swap_cmp ? b : swap_tmp;                                             \
  } while (0)
#else
#define SORT_CMP_SWAP(TYPE, CMP, a, b)                                         \
  do                                                                           \
    if (expect_with_probability(!CMP(a, b), 0, .5)) {                          \
      const TYPE swap_tmp = (a);                                               \
      (a) = (b);                                                               \
      (b) = swap_tmp;                                                          \
    }                                                                          \
  while (0)
#endif
/* Fixed sorting networks for 3..8 elements; the ASCII diagrams show the
 * comparator wiring, the bracketed lists the comparator pairs per stage. */
// 3 comparators, 3 parallel operations
// o-----^--^--o
//       |  |
// o--^--|--v--o
//    |  |
// o--v--v-----o
//
// [[1,2]]
// [[0,2]]
// [[0,1]]
#define SORT_NETWORK_3(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
  } while (0)
// 5 comparators, 3 parallel operations
// o--^--^--------o
//    |  |
// o--v--|--^--^--o
//       |  |  |
// o--^--v--|--v--o
//    |     |
// o--v-----v-----o
//
// [[0,1],[2,3]]
// [[0,2],[1,3]]
// [[1,2]]
#define SORT_NETWORK_4(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
  } while (0)
// 9 comparators, 5 parallel operations
// o--^--^-----^-----------o
//    |  |     |
// o--|--|--^--v-----^--^--o
//    |  |  |        |  |
// o--|--v--|--^--^--|--v--o
//    |     |  |  |  |
// o--|-----v--|--v--|--^--o
//    |        |     |  |
// o--v--------v-----v--v--o
//
// [[0,4],[1,3]]
// [[0,2]]
// [[2,4],[0,1]]
// [[2,3],[1,4]]
// [[1,2],[3,4]]
#define SORT_NETWORK_5(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
  } while (0)
// 12 comparators, 6 parallel operations
// o-----^--^--^-----------------o
//       |  |  |
// o--^--|--v--|--^--------^-----o
//    |  |     |  |        |
// o--v--v-----|--|--^--^--|--^--o
//             |  |  |  |  |  |
// o-----^--^--v--|--|--|--v--v--o
//       |  |     |  |  |
// o--^--|--v-----v--|--v--------o
//    |  |           |
// o--v--v-----------v-----------o
//
// [[1,2],[4,5]]
// [[0,2],[3,5]]
// [[0,1],[3,4],[2,5]]
// [[0,3],[1,4]]
// [[2,4],[1,3]]
// [[2,3]]
#define SORT_NETWORK_6(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
  } while (0)
// 16 comparators, 6 parallel operations
// o--^--------^-----^-----------------o
//    |        |     |
// o--|--^-----|--^--v--------^--^-----o
//    |  |     |  |           |  |
// o--|--|--^--v--|--^-----^--|--v-----o
//    |  |  |     |  |     |  |
// o--|--|--|-----v--|--^--v--|--^--^--o
//    |  |  |        |  |     |  |  |
// o--v--|--|--^-----v--|--^--v--|--v--o
//       |  |  |        |  |     |
// o-----v--|--|--------v--v-----|--^--o
//          |  |                 |  |
// o--------v--v-----------------v--v--o
//
// [[0,4],[1,5],[2,6]]
// [[0,2],[1,3],[4,6]]
// [[2,4],[3,5],[0,1]]
// [[2,3],[4,5]]
// [[1,4],[3,6]]
// [[1,2],[3,4],[5,6]]
#define SORT_NETWORK_7(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
  } while (0)
// 19 comparators, 6 parallel operations
// o--^--------^-----^-----------------o
//    |        |     |
// o--|--^-----|--^--v--------^--^-----o
//    |  |     |  |           |  |
// o--|--|--^--v--|--^-----^--|--v-----o
//    |  |  |     |  |     |  |
// o--|--|--|--^--v--|--^--v--|--^--^--o
//    |  |  |  |     |  |     |  |  |
// o--v--|--|--|--^--v--|--^--v--|--v--o
//       |  |  |  |     |  |     |
// o-----v--|--|--|--^--v--v-----|--^--o
//          |  |  |  |           |  |
// o--------v--|--v--|--^--------v--v--o
//             |     |  |
// o-----------v-----v--v--------------o
//
// [[0,4],[1,5],[2,6],[3,7]]
// [[0,2],[1,3],[4,6],[5,7]]
// [[2,4],[3,5],[0,1],[6,7]]
// [[2,3],[4,5]]
// [[1,4],[3,6]]
// [[1,2],[3,4],[5,6]]
#define SORT_NETWORK_8(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
  } while (0)
/* Dispatches a chunk of 0..8 elements to the matching sorting network;
 * larger lengths are a programming error (unreachable). */
#define SORT_INNER(TYPE, CMP, begin, end, len)                                 \
  switch (len) {                                                               \
  default:                                                                     \
    assert(false);                                                             \
    __unreachable();                                                           \
  case 0:                                                                      \
  case 1:                                                                      \
    break;                                                                     \
  case 2:                                                                      \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    break;                                                                     \
  case 3:                                                                      \
    SORT_NETWORK_3(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 4:                                                                      \
    SORT_NETWORK_4(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 5:                                                                      \
    SORT_NETWORK_5(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 6:                                                                      \
    SORT_NETWORK_6(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 7:                                                                      \
    SORT_NETWORK_7(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 8:                                                                      \
    SORT_NETWORK_8(TYPE, CMP, begin);                                          \
    break;                                                                     \
  }
/* Unconditional element swap. */
#define SORT_SWAP(TYPE, a, b)                                                  \
  do {                                                                         \
    const TYPE swap_tmp = (a);                                                 \
    (a) = (b);                                                                 \
    (b) = swap_tmp;                                                            \
  } while (0)
/* Pushes/pops a [lo, hi] partition onto/from the explicit quicksort stack
 * (`top` must be in scope, as in SORT_IMPL below). */
#define SORT_PUSH(low, high)                                                   \
  do {                                                                         \
    top->lo = (low);                                                           \
    top->hi = (high);                                                          \
    ++top;                                                                     \
  } while (0)
#define SORT_POP(low, high)                                                    \
  do {                                                                         \
    --top;                                                                     \
    low = top->lo;                                                             \
    high = top->hi;                                                            \
  } while (0)
/* Generates a non-recursive quicksort `static void NAME(begin, end)`:
 *  - median-of-3 pivot selection followed by two-way partitioning;
 *  - partitions shorter than 8 elements are finished via the SORT_INNER
 *    sorting networks;
 *  - recursion is replaced by an explicit stack: the smaller side of each
 *    split is processed next while the larger side is pushed;
 *  - with EXPECT_LOW_CARDINALITY_OR_PRESORTED enabled, already-sorted
 *    sub-ranges detected after partitioning are skipped entirely.
 * CMP(a, b) is a less-than style comparator.
 * NOTE(review): the audit loop asserts CMP for every adjacent pair, which
 * presumes either a non-strict comparator or distinct elements — confirm
 * at the instantiation sites. */
#define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP)        \
                                                                               \
  static inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) {   \
    while (++first <= last)                                                    \
      if (expect_with_probability(CMP(first[0], first[-1]), 1, .1))            \
        return false;                                                          \
    return true;                                                               \
  }                                                                            \
                                                                               \
  typedef struct {                                                             \
    TYPE *lo, *hi;                                                             \
  } NAME##_stack;                                                              \
                                                                               \
  __hot static void NAME(TYPE *const __restrict begin,                         \
                         TYPE *const __restrict end) {                         \
    NAME##_stack stack[sizeof(size_t) * CHAR_BIT], *__restrict top = stack;    \
                                                                               \
    TYPE *__restrict hi = end - 1;                                             \
    TYPE *__restrict lo = begin;                                               \
    while (true) {                                                             \
      const ptrdiff_t len = hi - lo;                                           \
      if (len < 8) {                                                           \
        SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1);                            \
        if (unlikely(top == stack))                                            \
          break;                                                               \
        SORT_POP(lo, hi);                                                      \
        continue;                                                              \
      }                                                                        \
                                                                               \
      TYPE *__restrict mid = lo + (len >> 1);                                  \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid);                                     \
      SORT_CMP_SWAP(TYPE, CMP, *mid, *hi);                                     \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid);                                     \
                                                                               \
      TYPE *right = hi - 1;                                                    \
      TYPE *left = lo + 1;                                                     \
      while (1) {                                                              \
        while (expect_with_probability(CMP(*left, *mid), 0, .5))               \
          ++left;                                                              \
        while (expect_with_probability(CMP(*mid, *right), 0, .5))              \
          --right;                                                             \
        if (unlikely(left > right)) {                                          \
          if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) {                           \
            if (NAME##_is_sorted(lo, right))                                   \
              lo = right + 1;                                                  \
            if (NAME##_is_sorted(left, hi))                                    \
              hi = left;                                                       \
          }                                                                    \
          break;                                                               \
        }                                                                      \
        SORT_SWAP(TYPE, *left, *right);                                        \
        mid = (mid == left) ? right : (mid == right) ? left : mid;             \
        ++left;                                                                \
        --right;                                                               \
      }                                                                        \
                                                                               \
      if (right - lo > hi - left) {                                            \
        SORT_PUSH(lo, right);                                                  \
        lo = left;                                                             \
      } else {                                                                 \
        SORT_PUSH(left, hi);                                                   \
        hi = right;                                                            \
      }                                                                        \
    }                                                                          \
                                                                               \
    if (AUDIT_ENABLED()) {                                                     \
      for (TYPE *scan = begin + 1; scan < end; ++scan)                         \
        assert(CMP(scan[-1], scan[0]));                                        \
    }                                                                          \
  }
/*------------------------------------------------------------------------------
* LY: radix sort for large chunks */
/* Generates `static bool NAME##_radixsort(begin, length)` — an LSD radix
 * sort processing 16 key bits (two 8-bit digits) per pass over
 * EXTRACT_KEY(item):
 *  - both digit histograms for a pass are gathered in one scan, which also
 *    accumulates key_diff_mask (OR of XORs of consecutive keys);
 *  - the low-digit pass scatters into `tmp`; if the remaining high bits
 *    carry no information (key_diff_mask < 256) the items are copied back
 *    and the sort finishes early, otherwise the high-digit pass scatters
 *    back into `begin`;
 *  - scratch space is either the caller-provided tail right past the data
 *    (BUFFER_PREALLOCATED + END_GAP) or a heap allocation.
 * `length` must be non-zero (the scan loops are do/while).
 * Returns false only on allocation failure. */
#define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP)  \
                                                                               \
  __hot static bool NAME##_radixsort(TYPE *const begin, const size_t length) { \
    TYPE *tmp;                                                                 \
    if (BUFFER_PREALLOCATED) {                                                 \
      tmp = begin + length + END_GAP;                                          \
      /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */                    \
    } else {                                                                   \
      tmp = osal_malloc(sizeof(TYPE) * length);                                \
      if (unlikely(!tmp))                                                      \
        return false;                                                          \
    }                                                                          \
                                                                               \
    size_t key_shift = 0, key_diff_mask;                                       \
    do {                                                                       \
      struct {                                                                 \
        pgno_t a[256], b[256];                                                 \
      } counters;                                                              \
      memset(&counters, 0, sizeof(counters));                                  \
                                                                               \
      key_diff_mask = 0;                                                       \
      size_t prev_key = EXTRACT_KEY(begin) >> key_shift;                       \
      TYPE *r = begin, *end = begin + length;                                  \
      do {                                                                     \
        const size_t key = EXTRACT_KEY(r) >> key_shift;                        \
        counters.a[key & 255]++;                                               \
        counters.b[(key >> 8) & 255]++;                                        \
        key_diff_mask |= prev_key ^ key;                                       \
        prev_key = key;                                                        \
      } while (++r != end);                                                    \
                                                                               \
      pgno_t ta = 0, tb = 0;                                                   \
      for (size_t i = 0; i < 256; ++i) {                                       \
        const pgno_t ia = counters.a[i];                                       \
        counters.a[i] = ta;                                                    \
        ta += ia;                                                              \
        const pgno_t ib = counters.b[i];                                       \
        counters.b[i] = tb;                                                    \
        tb += ib;                                                              \
      }                                                                        \
                                                                               \
      r = begin;                                                               \
      do {                                                                     \
        const size_t key = EXTRACT_KEY(r) >> key_shift;                        \
        tmp[counters.a[key & 255]++] = *r;                                     \
      } while (++r != end);                                                    \
                                                                               \
      if (unlikely(key_diff_mask < 256)) {                                     \
        memcpy(begin, tmp, ptr_dist(end, begin));                              \
        break;                                                                 \
      }                                                                        \
      end = (r = tmp) + length;                                                \
      do {                                                                     \
        const size_t key = EXTRACT_KEY(r) >> key_shift;                        \
        begin[counters.b[(key >> 8) & 255]++] = *r;                            \
      } while (++r != end);                                                    \
                                                                               \
      key_shift += 16;                                                         \
    } while (key_diff_mask >> 16);                                             \
                                                                               \
    if (!(BUFFER_PREALLOCATED))                                                \
      osal_free(tmp);                                                          \
    return true;                                                               \
  }
/*------------------------------------------------------------------------------
* LY: Binary search */
/* An empty inline-asm that makes `size` artificially depend on `flag`,
 * preventing the Clang/x86 optimizer from mis-transforming the branchless
 * binary-search step in SEARCH_IMPL below; a no-op elsewhere. */
#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__)
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag)                         \
  do                                                                           \
    __asm __volatile(""                                                        \
                     : "+r"(size)                                              \
                     : "r" /* the `b` constraint is more suitable here, but \
                              cause CLANG to allocate and push/pop an one more \
                              register, so using the `r` which avoids this. */ \
                     (flag));                                                  \
  while (0)
#else
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag)                         \
  do {                                                                         \
    /* nope for non-clang or non-x86 */;                                       \
  } while (0)
#endif /* Workaround for CLANG */
/* *INDENT-OFF* */
/* clang-format off */
/* Generates `static NAME(list, length, item)` — a binary search returning
 * a pointer to the first element for which CMP(element, item) is false,
 * i.e. the lower-bound position under a less-than style CMP. Two variants
 * are compiled in: a branchless (cmov-friendly) step and a conditional
 * branch step; MDBX_HAVE_CMOV selects between them. */
#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP)                            \
  static __always_inline const TYPE_LIST *NAME(                                \
      const TYPE_LIST *it, size_t length, const TYPE_ARG item) {               \
    const TYPE_LIST *const begin = it, *const end = begin + length;            \
                                                                               \
    if (MDBX_HAVE_CMOV)                                                        \
      do {                                                                     \
        /* Adaptively simplified binary-search step:                           \
         *  - branch-free when cmov (or an analogue) is available;             \
         *  - may perform a few extra iterations;                              \
         *  - searches only while size > 2, so the search has to be           \
         *    finished among the remaining 0..2 elements below. */             \
        const TYPE_LIST *const middle = it + (length >> 1);                    \
        length = (length + 1) >> 1;                                            \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5);  \
        WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag);                      \
        it = flag ? middle : it;                                               \
      } while (length > 2);                                                    \
    else                                                                       \
      while (length > 2) {                                                     \
        /* Variant using a conditional branch. The key difference is that      \
         * on "not equal" (true from the comparator) the step lands one        \
         * element closer to the end of the array. Algorithmically this is     \
         * correct and converges slightly faster, but costs extra              \
         * computation when the comparator yields true. Also IMPORTANT(!):     \
         * speculative execution at size == 0 is not allowed. */               \
        const TYPE_LIST *const middle = it + (length >> 1);                    \
        length = (length + 1) >> 1;                                            \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5);  \
        if (flag) {                                                            \
          it = middle + 1;                                                     \
          length -= 1;                                                         \
        }                                                                      \
      }                                                                        \
    it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5);        \
    it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5);        \
                                                                               \
    if (AUDIT_ENABLED()) {                                                     \
      for (const TYPE_LIST *scan = begin; scan < it; ++scan)                   \
        assert(CMP(*scan, item));                                              \
      for (const TYPE_LIST *scan = it; scan < end; ++scan)                     \
        assert(!CMP(*scan, item));                                             \
      (void)begin, (void)end;                                                  \
    }                                                                          \
                                                                               \
    return it;                                                                 \
  }
/* *INDENT-ON* */
/* clang-format on */

484
src/spill.c Normal file
View File

@ -0,0 +1,484 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Lazily removes a spilled page run from txn->tw.spilled.list.
 * List entries are (pgno << 1); setting the low bit tags a slot as
 * deleted, and real compaction is deferred to spill_purge().
 * `least_removed` tracks the lowest (1-based) tagged slot index so the
 * purge can start from there. For npages > 1 the adjacent slots of the
 * contiguous page run are tagged too — walking upward for an ascending
 * PNL and downward otherwise — and the walk stops as soon as the run is
 * not present as consecutive slots. */
void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) {
  tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) &&
                   txn->tw.spilled.least_removed > 0);
  txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed)
                                      ? idx
                                      : txn->tw.spilled.least_removed;
  /* tag the slot as deleted */
  txn->tw.spilled.list[idx] |= 1;
  /* if the tagged slot is the very last one, shrink the list right away */
  MDBX_PNL_SETSIZE(txn->tw.spilled.list,
                   MDBX_PNL_GETSIZE(txn->tw.spilled.list) -
                       (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));

  while (unlikely(npages > 1)) {
    /* the next page number the contiguous run must continue with */
    const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1;
    if (MDBX_PNL_ASCENDING) {
      if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) ||
          (txn->tw.spilled.list[idx] >> 1) != pgno)
        return;
    } else {
      if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno)
        return;
      txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed)
                                          ? idx
                                          : txn->tw.spilled.least_removed;
    }
    txn->tw.spilled.list[idx] |= 1;
    MDBX_PNL_SETSIZE(txn->tw.spilled.list,
                     MDBX_PNL_GETSIZE(txn->tw.spilled.list) -
                         (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
    --npages;
  }
}
/* Compacts txn->tw.spilled.list in place by dropping the slots tagged as
 * deleted (low bit set) by spill_remove(), then resets the lazy-deletion
 * watermark. Returns the (possibly shortened) list. */
pnl_t spill_purge(MDBX_txn *txn) {
  tASSERT(txn, txn->tw.spilled.least_removed > 0);
  const pnl_t sl = txn->tw.spilled.list;
  if (txn->tw.spilled.least_removed == INT_MAX) {
    /* nothing was removed since the last purge */
    for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i)
      tASSERT(txn, (sl[i] & 1) == 0);
    return sl;
  }

  const size_t len = MDBX_PNL_GETSIZE(sl);
  size_t rd = txn->tw.spilled.least_removed, wr = rd;
  while (rd <= len) {
    sl[wr] = sl[rd];
    /* advance the write cursor only for slots not tagged deleted */
    wr += 1 - (sl[rd] & 1);
    ++rd;
  }
  for (size_t i = 1; i < wr; ++i)
    tASSERT(txn, (sl[i] & 1) == 0);
  MDBX_PNL_SETSIZE(sl, wr - 1);
  txn->tw.spilled.least_removed = INT_MAX;
  return sl;
}
/*----------------------------------------------------------------------------*/
/* Queues one dirty page (or a multi-page overflow span) into the write
 * iov-context and, on success, records its page number(s) in the
 * spilled list. */
static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp,
                      const size_t npages) {
  tASSERT(txn, !(txn->flags & MDBX_WRITEMAP));
#if MDBX_ENABLE_PGOP_STAT
  txn->env->lck->pgops.spill.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
  /* capture pgno beforehand, iov_page() may consume the page */
  const pgno_t first_pgno = dp->pgno;
  const int rc = iov_page(txn, ctx, dp, npages);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  return spill_append_span(&txn->tw.spilled.list, first_pgno, npages);
}
/* Set unspillable LRU-label for dirty pages watched by txn.
 * Walks the cursor's page stack (descending into nested sub-cursors via
 * N_SUBDATA leaf nodes) and refreshes the LRU stamp of every dirty page
 * it references, so spill_prio() will see age == 0 and skip them.
 * Returns the number of pages marked as unspillable. */
static size_t spill_cursor_keep(const MDBX_txn *const txn,
                                const MDBX_cursor *mc) {
  tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
  size_t keep = 0;
  while (!is_poor(mc)) {
    tASSERT(txn, mc->top >= 0);
    const page_t *mp;
    intptr_t i = 0;
    do {
      mp = mc->pg[i];
      tASSERT(txn, !is_subpage(mp));
      if (is_modifable(txn, mp)) {
        size_t const n = dpl_search(txn, mp->pgno);
        if (txn->tw.dirtylist->items[n].pgno == mp->pgno &&
            /* don't count the same page twice */ dpl_age(txn, n)) {
          /* the LRU stamp is stored in the size_t word just before the
           * page payload */
          size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr,
                                       -(ptrdiff_t)sizeof(size_t));
          *ptr = txn->tw.dirtylru;
          tASSERT(txn, dpl_age(txn, n) == 0);
          ++keep;
        }
      }
    } while (++i <= mc->top);

    tASSERT(txn, is_leaf(mp));
    if (!mc->subcur || mc->ki[mc->top] >= page_numkeys(mp))
      break;
    if (!(node_flags(page_node(mp, mc->ki[mc->top])) & N_SUBDATA))
      break;
    /* continue with the nested sub-database cursor */
    mc = &mc->subcur->cursor;
  }
  return keep;
}
/* Marks as unspillable every dirty page referenced by any tracked cursor
 * of the transaction, including the extra cursor `m0` (counted once);
 * returns the total number of pages protected. */
static size_t spill_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
  tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
  dpl_lru_turn(txn);
  size_t total = 0;
  if (m0)
    total = spill_cursor_keep(txn, m0);
  TXN_FOREACH_DBI_ALL(txn, dbi) {
    const bool relevant = F_ISSET(txn->dbi_state[dbi], DBI_DIRTY | DBI_VALID) &&
                          txn->dbs[dbi].root != P_INVALID;
    if (relevant) {
      for (MDBX_cursor *scan = txn->cursors[dbi]; scan; scan = scan->next)
        if (scan != m0)
          total += spill_cursor_keep(txn, scan);
    }
  }
  return total;
}
/* Returns the spilling priority (0..255) for a dirty page:
 *   0 = should be spilled;
 *   ...
 *   > 255 = must not be spilled. */
MDBX_NOTHROW_PURE_FUNCTION static unsigned
spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) {
  dpl_t *const dl = txn->tw.dirtylist;
  const uint32_t age = dpl_age(txn, i);
  const size_t npages = dpl_npages(dl, i);
  const pgno_t pgno = dl->items[i].pgno;
  /* age == 0 means the page was just touched/kept — never spill it */
  if (age == 0) {
    DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno);
    return 256;
  }

  page_t *const dp = dl->items[i].ptr;
  if (dp->flags & (P_LOOSE | P_SPILLED)) {
    DEBUG("skip %s %zu page %" PRIaPGNO,
          (dp->flags & P_LOOSE) ? "loose" : "parent-spilled", npages, pgno);
    return 256;
  }

  /* Can't spill twice,
   * make sure it's not already in a parent's spill list(s). */
  MDBX_txn *parent = txn->parent;
  if (parent && (parent->flags & MDBX_TXN_SPILLS)) {
    do
      if (spill_intersect(parent, pgno, npages)) {
        DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno);
        dp->flags |= P_SPILLED;
        return 256;
      }
    while ((parent = parent->parent) != nullptr);
  }

  /* quantize the age into 0..255 using the caller-precomputed fixed-point
   * reciprocal of (age_max + 1), then invert: the older the page, the
   * lower (more spill-worthy) the returned priority */
  tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX);
  unsigned prio = age * reciprocal >> 24;
  tASSERT(txn, prio < 256);
  if (likely(npages == 1))
    return prio = 256 - prio;

  /* make a large/overflow pages be likely to spill */
  /* smear the bits to compute ceil-log2 of npages via log2n_powerof2() */
  size_t factor = npages | npages >> 1;
  factor |= factor >> 2;
  factor |= factor >> 4;
  factor |= factor >> 8;
  factor |= factor >> 16;
  factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157;
  factor = (factor < 256) ? 255 - factor : 0;
  tASSERT(txn, factor < 256 && factor < (256 - prio));
  return prio = (unsigned)factor;
}
/* Clamps the wanted amount of entries/pages to spill into the sane range
 * [spill_min .. spill_max] derived from the env options
 * (spill_min_denominator / spill_max_denominator) and the current total.
 * The lower bound wins when the bounds cross. */
static size_t spill_gate(const MDBX_env *env, intptr_t part,
                         const size_t total) {
  const intptr_t spill_min =
      env->options.spill_min_denominator
          ? (total + env->options.spill_min_denominator - 1) /
                env->options.spill_min_denominator
          : 1;
  const intptr_t spill_max =
      total - (env->options.spill_max_denominator
                   ? total / env->options.spill_max_denominator
                   : 0);
  if (part > spill_max)
    part = spill_max;
  if (part < spill_min)
    part = spill_min;
  eASSERT(env, part >= 0 && (size_t)part <= total);
  return (size_t)part;
}
/* Slow path of txn_spill(): pushes some dirty pages out to disk (or msyncs
 * the whole map in MDBX_WRITEMAP mode) to free dirty-room before the
 * caller dirties `need` more pages. `m0` is the caller's cursor whose
 * pages must stay unspilled; wanna_spill_* are the raw shortages computed
 * by txn_spill(), clamped here via spill_gate(). Returns MDBX_SUCCESS,
 * MDBX_TXN_FULL when not enough room could be freed, or an error code. */
__cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
                          const intptr_t wanna_spill_entries,
                          const intptr_t wanna_spill_npages,
                          const size_t need) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);

  int rc = MDBX_SUCCESS;
  /* nothing to do while loose pages cover all the dirty ones */
  if (unlikely(txn->tw.loose_count >=
               (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
                                  : txn->tw.writemap_dirty_npages)))
    goto done;

  const size_t dirty_entries =
      txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1;
  const size_t dirty_npages =
      (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
                         : txn->tw.writemap_dirty_npages) -
      txn->tw.loose_count;
  const size_t need_spill_entries =
      spill_gate(txn->env, wanna_spill_entries, dirty_entries);
  const size_t need_spill_npages =
      spill_gate(txn->env, wanna_spill_npages, dirty_npages);

  const size_t need_spill = (need_spill_entries > need_spill_npages)
                                ? need_spill_entries
                                : need_spill_npages;
  if (!need_spill)
    goto done;

  /* write-map mode: instead of writing individual pages, kick an msync of
   * the whole used range and account everything as flushed */
  if (txn->flags & MDBX_WRITEMAP) {
    NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync",
           dirty_entries, dirty_npages);
    const MDBX_env *env = txn->env;
    tASSERT(txn, txn->tw.spilled.list == nullptr);
    rc = osal_msync(&txn->env->dxb_mmap, 0,
                    pgno_align2os_bytes(env, txn->geo.first_unallocated),
                    MDBX_SYNC_KICK);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
#if MDBX_AVOID_MSYNC
    MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
    tASSERT(txn, dpl_check(txn));
    env->lck->unsynced_pages.weak +=
        txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count;
    dpl_clear(txn->tw.dirtylist);
    txn->tw.dirtyroom = env->options.dp_limit - txn->tw.loose_count;
    /* re-add the loose pages: they must stay tracked in the dirty-list */
    for (page_t *lp = txn->tw.loose_pages; lp != nullptr; lp = page_next(lp)) {
      tASSERT(txn, lp->flags == P_LOOSE);
      rc = dpl_append(txn, lp->pgno, lp, 1);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
      MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
      VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
    }
    tASSERT(txn, dpl_check(txn));
#else
    tASSERT(txn, txn->tw.dirtylist == nullptr);
    env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
    txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages;
    txn->tw.writemap_dirty_npages = 0;
#endif /* MDBX_AVOID_MSYNC */
    goto done;
  }

  NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write",
         need_spill_entries, need_spill_npages);
  MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
  tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1);
  tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >=
                   need_spill_npages);
  if (!txn->tw.spilled.list) {
    txn->tw.spilled.least_removed = INT_MAX;
    txn->tw.spilled.list = pnl_alloc(need_spill);
    if (unlikely(!txn->tw.spilled.list)) {
      rc = MDBX_ENOMEM;
    bailout:
      txn->flags |= MDBX_TXN_ERROR;
      return rc;
    }
  } else {
    /* purge deleted slots */
    spill_purge(txn);
    rc = pnl_reserve(&txn->tw.spilled.list, need_spill);
    (void)rc /* ignore since the resulting list may be shorter
     and pnl_append() will increase pnl on demand */
        ;
  }

  /* Sort the dirty-list so the writes to disk are more sequential */
  dpl_t *const dl = dpl_sort(txn);

  /* Preserve pages which may soon be dirtied again */
  const size_t unspillable = spill_txn_keep(txn, m0);
  if (unspillable + txn->tw.loose_count >= dl->length) {
#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */
    if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
      return MDBX_SUCCESS;
#endif /* xMDBX_DEBUG_SPILLING */
    ERROR("all %zu dirty pages are unspillable since referenced "
          "by a cursor(s), use fewer cursors or increase "
          "MDBX_opt_txn_dp_limit",
          unspillable);
    goto done;
  }

  /* Subtask: push some of the pages out to disk according to LRU, while
   * taking important adjustments into account:
   *  - it is preferable to evict old large/overflow pages, since this
   *    frees more memory, and (as currently understood) such pages are
   *    much less likely to be modified again;
   *  - all else being equal, it is preferable to evict adjacent pages in
   *    order to reduce the number of I/O operations;
   *  - this should preferably cost less time than std::partial_sort_copy;
   *
   * Approach:
   *  - Quantize the whole range of lru-labels down to 256 values and use a
   *    single pass of 8-bit radix-sort. This yields 256 levels of
   *    "freshness", including the lru-label value older than which pages
   *    must be evicted;
   *  - Walk sequentially towards increasing page numbers and evict the
   *    pages whose lru-label is older than the cut-off value, until enough
   *    have been evicted;
   *  - Pages adjacent to those being evicted are evicted as well (to
   *    reduce the number of I/O operations) when they fall into the first
   *    half between the evicted and the freshest lru-labels;
   *  - additionally, the sorting deliberately "ages" large/overflow pages,
   *    thereby raising their chances of being evicted. */

  /* get min/max of LRU-labels */
  uint32_t age_max = 0;
  for (size_t i = 1; i <= dl->length; ++i) {
    const uint32_t age = dpl_age(txn, i);
    age_max = (age_max >= age) ? age_max : age;
  }

  VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);

  /* half of 8-bit radix-sort */
  pgno_t radix_entries[256], radix_npages[256];
  memset(&radix_entries, 0, sizeof(radix_entries));
  memset(&radix_npages, 0, sizeof(radix_npages));
  size_t spillable_entries = 0, spillable_npages = 0;
  const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
  for (size_t i = 1; i <= dl->length; ++i) {
    const unsigned prio = spill_prio(txn, i, reciprocal);
    size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
    TRACE("page %" PRIaPGNO
          ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u",
          dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 'Y' : 'N',
          dpl_npages(dl, i), dpl_age(txn, i), age_max, prio);
    if (prio < 256) {
      radix_entries[prio] += 1;
      spillable_entries += 1;
      const pgno_t npages = dpl_npages(dl, i);
      radix_npages[prio] += npages;
      spillable_npages += npages;
    }
  }

  tASSERT(txn, spillable_npages >= spillable_entries);
  pgno_t spilled_entries = 0, spilled_npages = 0;
  if (likely(spillable_entries > 0)) {
    /* pick the spill cut-off (prio2spill) and the co-spill cut-off for
     * adjacent pages (prio2adjacent) from the radix histograms */
    size_t prio2spill = 0, prio2adjacent = 128,
           amount_entries = radix_entries[0], amount_npages = radix_npages[0];
    for (size_t i = 1; i < 256; i++) {
      if (amount_entries < need_spill_entries ||
          amount_npages < need_spill_npages) {
        prio2spill = i;
        prio2adjacent = i + (257 - i) / 2;
        amount_entries += radix_entries[i];
        amount_npages += radix_npages[i];
      } else if (amount_entries + amount_entries <
                     spillable_entries + need_spill_entries
                 /* EQUIVALENT to: amount - need_spill < spillable - amount */
                 || amount_npages + amount_npages <
                        spillable_npages + need_spill_npages) {
        prio2adjacent = i;
        amount_entries += radix_entries[i];
        amount_npages += radix_npages[i];
      } else
        break;
    }

    VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu,"
            " wanna-spill %zu/%zu, amount %zu/%zu",
            prio2spill, prio2adjacent, spillable_entries, spillable_npages,
            need_spill_entries, need_spill_npages, amount_entries,
            amount_npages);
    tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);

    iov_ctx_t ctx;
    rc = iov_init(
        txn, &ctx, amount_entries, amount_npages,
#if defined(_WIN32) || defined(_WIN64)
        txn->env->ioring.overlapped_fd ? txn->env->ioring.overlapped_fd :
#endif
                                       txn->env->lazy_fd,
        true);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    /* single pass: copy-compact the dirty-list (r -> w) while diverting
     * the selected pages into the write context */
    size_t r = 0, w = 0;
    pgno_t last = 0;
    while (r < dl->length && (spilled_entries < need_spill_entries ||
                              spilled_npages < need_spill_npages)) {
      dl->items[++w] = dl->items[++r];
      unsigned prio = spill_prio(txn, w, reciprocal);
      if (prio > prio2spill &&
          (prio >= prio2adjacent || last != dl->items[w].pgno))
        continue;

      const size_t e = w;
      last = dpl_endpgno(dl, w);
      /* back-scan over already-kept adjacent pages eligible for co-spill */
      while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno &&
             spill_prio(txn, w, reciprocal) < prio2adjacent)
        ;

      for (size_t i = w; ++i <= e;) {
        const unsigned npages = dpl_npages(dl, i);
        prio = spill_prio(txn, i, reciprocal);
        DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)",
              (prio > prio2spill) ? "co-" : "", i, npages, dl->items[i].pgno,
              dpl_age(txn, i), prio);
        tASSERT(txn, prio < 256);
        ++spilled_entries;
        spilled_npages += npages;
        rc = spill_page(txn, &ctx, dl->items[i].ptr, npages);
        if (unlikely(rc != MDBX_SUCCESS))
          goto failed;
      }
    }

    VERBOSE("spilled entries %u, spilled npages %u", spilled_entries,
            spilled_npages);
    tASSERT(txn, spillable_entries == 0 || spilled_entries > 0);
    tASSERT(txn, spilled_npages >= spilled_entries);

  failed:
    /* finish the compaction even on error, the dirty-list must stay sane */
    while (r < dl->length)
      dl->items[++w] = dl->items[++r];
    tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS);

    dl->sorted = dpl_setlen(dl, w);
    txn->tw.dirtyroom += spilled_entries;
    txn->tw.dirtylist->pages_including_loose -= spilled_npages;
    tASSERT(txn, dpl_check(txn));

    if (!iov_empty(&ctx)) {
      tASSERT(txn, rc == MDBX_SUCCESS);
      rc = iov_write(&ctx);
    }
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    txn->env->lck->unsynced_pages.weak += spilled_npages;
    pnl_sort(txn->tw.spilled.list, (size_t)txn->geo.first_unallocated << 1);
    txn->flags |= MDBX_TXN_SPILLS;
    NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room",
           spilled_entries, spilled_npages, txn->tw.dirtyroom);
  } else {
    tASSERT(txn, rc == MDBX_SUCCESS);
    for (size_t i = 1; i <= dl->length; ++i) {
      page_t *dp = dl->items[i].ptr;
      VERBOSE(
          "unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u",
          i, dp->pgno, dpl_npages(dl, i), dp->flags, dpl_age(txn, i),
          spill_prio(txn, i, reciprocal));
    }
  }

#if xMDBX_DEBUG_SPILLING == 2
  if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
    ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; "
          "needed %zu, spillable %zu; "
          "spilled %u dirty-entries, now have %zu dirty-room",
          dl->length + spilled_entries, dl->length,
          (txn->parent && txn->parent->tw.dirtylist)
              ? (intptr_t)txn->parent->tw.dirtylist->length
              : -1,
          txn->tw.loose_count, need, spillable_entries, spilled_entries,
          txn->tw.dirtyroom);
  ENSURE(txn->env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
#endif /* xMDBX_DEBUG_SPILLING */

done:
  return likely(txn->tw.dirtyroom + txn->tw.loose_count >
                ((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need))
             ? MDBX_SUCCESS
             : MDBX_TXN_FULL;
}

86
src/spill.h Normal file
View File

@ -0,0 +1,86 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
MDBX_INTERNAL void spill_remove(MDBX_txn *txn, size_t idx, size_t npages);
MDBX_INTERNAL pnl_t spill_purge(MDBX_txn *txn);
MDBX_INTERNAL int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
const intptr_t wanna_spill_entries,
const intptr_t wanna_spill_npages,
const size_t need);
/*----------------------------------------------------------------------------*/
/* Looks up a page number in the txn's spilled list (entries are stored as
 * pgno << 1, low bit = lazily-deleted tag). Returns the 1-based slot
 * index of an exact (non-deleted) match, or 0 when absent. */
static inline size_t spill_search(const MDBX_txn *txn, pgno_t pgno) {
  tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
  const pnl_t pnl = txn->tw.spilled.list;
  if (likely(!pnl))
    return 0;
  const pgno_t needle = pgno << 1;
  const size_t n = pnl_search(pnl, needle, (size_t)MAX_PAGENO + MAX_PAGENO + 1);
  if (n > MDBX_PNL_GETSIZE(pnl) || pnl[n] != needle)
    return 0;
  return n;
}
/* Tests whether any page of the run [pgno, pgno + npages) occurs in the
 * txn's spilled list (entries are pgno << 1, low bit = deleted tag).
 * The binary search probes from the appropriate end of the run depending
 * on the PNL ordering; with assertions enabled the result is
 * cross-checked against per-page spill_search() lookups. */
static inline bool spill_intersect(const MDBX_txn *txn, pgno_t pgno,
                                   size_t npages) {
  const pnl_t pnl = txn->tw.spilled.list;
  if (likely(!pnl))
    return false;
  const size_t len = MDBX_PNL_GETSIZE(pnl);
  if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
    DEBUG_EXTRA("PNL len %zu [", len);
    for (size_t i = 1; i <= len; ++i)
      DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1)
                                             : (long)(pnl[i] >> 1));
    DEBUG_EXTRA_PRINT("%s\n", "]");
  }
  const pgno_t spilled_range_begin = pgno << 1;
  const pgno_t spilled_range_last = ((pgno + (pgno_t)npages) << 1) - 1;
#if MDBX_PNL_ASCENDING
  /* ascending list: find the first entry >= range begin, check it is
   * within the range end */
  const size_t n =
      pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1);
  tASSERT(txn, n && (n == MDBX_PNL_GETSIZE(pnl) + 1 ||
                     spilled_range_begin <= pnl[n]));
  const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last;
#else
  /* descending list: mirror-image search from the range end */
  const size_t n =
      pnl_search(pnl, spilled_range_last, (size_t)MAX_PAGENO + MAX_PAGENO + 1);
  tASSERT(txn, n && (n == MDBX_PNL_GETSIZE(pnl) + 1 ||
                     spilled_range_last >= pnl[n]));
  const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin;
#endif
  if (ASSERT_ENABLED()) {
    bool check = false;
    for (size_t i = 0; i < npages; ++i)
      check |= spill_search(txn, (pgno_t)(pgno + i)) != 0;
    tASSERT(txn, check == rc);
  }
  return rc;
}
/* Fast-path gate for spilling: computes how far the txn is over its
 * dirty-entries/dirty-pages budgets and returns immediately when there is
 * nothing to spill; otherwise delegates to spill_slowpath().
 * `need` is the number of dirty pages the caller is about to add; `m0` is
 * the caller's (tracked) cursor whose pages must be preserved. */
static inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
                            const size_t need) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
  tASSERT(txn, !m0 || cursor_is_tracked(m0));

  /* shortage of dirty-list slots; no entries accounting without a
   * dirty-list */
  const intptr_t wanna_spill_entries =
      txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0;
  /* shortage of dirty pages against the dp_limit budget */
  const intptr_t wanna_spill_npages =
      need +
      (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
                         : txn->tw.writemap_dirty_npages) -
      txn->tw.loose_count - txn->env->options.dp_limit;

  /* production mode */
  if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1)
#if xMDBX_DEBUG_SPILLING == 1
      /* debug mode: always try to spill if xMDBX_DEBUG_SPILLING == 1 */
      && txn->txnid % 23 > 11
#endif
  )
    return MDBX_SUCCESS;

  return spill_slowpath(txn, m0, wanna_spill_entries, wanna_spill_npages, need);
}

104
src/subdb.c Normal file
View File

@ -0,0 +1,104 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Validates a sub-database's flags and fills in its key/value comparators
 * and length limits (kvx); for DUPFIXED/INTEGERDUP trees with a non-zero
 * dupfix_size the value length is pinned to that exact size.
 * Returns MDBX_SUCCESS, MDBX_INCOMPATIBLE or MDBX_CORRUPTED. */
int sdb_setup(const MDBX_env *env, kvx_t *const kvx, const tree_t *const db) {
  if (unlikely(!check_sdb_flags(db->flags))) {
    ERROR("incompatible or invalid db.flags (0x%x) ", db->flags);
    return MDBX_INCOMPATIBLE;
  }

  /* install the builtin comparators unless custom ones are already set */
  if (unlikely(!kvx->clc.k.cmp)) {
    kvx->clc.k.cmp = builtin_keycmp(db->flags);
    kvx->clc.v.cmp = builtin_datacmp(db->flags);
  }

  kvx->clc.k.lmin = keysize_min(db->flags);
  kvx->clc.k.lmax = env_keysize_max(env, db->flags);
  kvx->clc.v.lmin = valsize_min(db->flags);
  kvx->clc.v.lmax = env_valsize_max(env, db->flags);

  const bool dupfix = (db->flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0;
  if (dupfix && db->dupfix_size) {
    if (!MDBX_DISABLE_VALIDATION &&
        unlikely(db->dupfix_size < kvx->clc.v.lmin ||
                 db->dupfix_size > kvx->clc.v.lmax)) {
      ERROR("db.dupfix_size (%u) <> min/max value-length (%zu/%zu)",
            db->dupfix_size, kvx->clc.v.lmin, kvx->clc.v.lmax);
      return MDBX_CORRUPTED;
    }
    kvx->clc.v.lmin = kvx->clc.v.lmax = db->dupfix_size;
  }
  return MDBX_SUCCESS;
}
/* Re-reads the named sub-database record for `dbi` from MAIN_DBI and
 * refreshes txn->dbs[dbi], clearing the DBI_STALE bit on success.
 * Validates that the record is still a named sub-DB (node flags, record
 * size), that its persistent flags match, and that its mod_txnid is not
 * newer than the containing page. Returns MDBX_SUCCESS, MDBX_BAD_DBI,
 * MDBX_INCOMPATIBLE, MDBX_CORRUPTED or another error code. */
int sdb_fetch(MDBX_txn *txn, size_t dbi) {
  cursor_couple_t couple;
  int rc = cursor_init(&couple.outer, txn, MAIN_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  kvx_t *const kvx = &txn->env->kvs[dbi];
  rc = tree_search(&couple.outer, &kvx->name, 0);
  if (unlikely(rc != MDBX_SUCCESS)) {
  bailout:
    NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN
           " (err %d)",
           dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
           txn->txnid, rc);
    return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc;
  }

  MDBX_val data;
  struct node_search_result nsr = node_search(&couple.outer, &kvx->name);
  if (unlikely(!nsr.exact)) {
    rc = MDBX_NOTFOUND;
    goto bailout;
  }
  /* the record must be a named sub-DB node, not dup-data */
  if (unlikely((node_flags(nsr.node) & (N_DUPDATA | N_SUBDATA)) != N_SUBDATA)) {
    NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)",
           dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
           txn->txnid, "wrong flags");
    return MDBX_INCOMPATIBLE; /* not a named DB */
  }

  rc = node_read(&couple.outer, nsr.node, &data,
                 couple.outer.pg[couple.outer.top]);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(data.iov_len != sizeof(tree_t))) {
    NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)",
           dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
           txn->txnid, "wrong rec-size");
    return MDBX_INCOMPATIBLE; /* not a named DB */
  }

  uint16_t flags = UNALIGNED_PEEK_16(data.iov_base, tree_t, flags);
  /* The txn may not know this DBI, or another process may
   * have dropped and recreated the DB with other flags. */
  tree_t *const db = &txn->dbs[dbi];
  if (unlikely((db->flags & DB_PERSISTENT_FLAGS) != flags)) {
    NOTICE("dbi %zu refs to the re-created subDB `%*s` for txn %" PRIaTXN
           " with different flags (present 0x%X != wanna 0x%X)",
           dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base,
           txn->txnid, db->flags & DB_PERSISTENT_FLAGS, flags);
    return MDBX_INCOMPATIBLE;
  }

  memcpy(db, data.iov_base, sizeof(tree_t));
#if !MDBX_DISABLE_VALIDATION
  const txnid_t pp_txnid = couple.outer.pg[couple.outer.top]->txnid;
  tASSERT(txn, txn->front_txnid >= pp_txnid);
  if (unlikely(db->mod_txnid > pp_txnid)) {
    ERROR("db.mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")",
          db->mod_txnid, pp_txnid);
    return MDBX_CORRUPTED;
  }
#endif /* !MDBX_DISABLE_VALIDATION */
  rc = sdb_setup(txn->env, kvx, db);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  txn->dbi_state[dbi] &= ~DBI_STALE;
  return MDBX_SUCCESS;
}

610
src/tls.c Normal file
View File

@ -0,0 +1,610 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Process-wide registry ("run-time thread context") of open environments,
 * used by rthc_uniq_check() for lck-file uniqueness detection. */
typedef struct rthc_entry {
  MDBX_env *env;
} rthc_entry_t;

#if MDBX_DEBUG
#define RTHC_INITIAL_LIMIT 1
#else
#define RTHC_INITIAL_LIMIT 16
#endif

/* Registry storage; starts with the static table.
 * NOTE(review): presumably re-pointed to a heap-allocated table when the
 * limit is exceeded — confirm against the (out-of-view) registration
 * code. */
static unsigned rthc_count, rthc_limit = RTHC_INITIAL_LIMIT;
static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT];
static rthc_entry_t *rthc_table = rthc_table_static;
/* Probes whether the lck-file being opened (`pending`) refers to the same
 * underlying file as an already-mapped one (`scan`) by comparing the
 * bait_uniqueness stamp: read straight from the pending mapping when it
 * exists, otherwise via pread from the file after msync'ing `scan`.
 * Returns MDBX_RESULT_TRUE when the baits match, MDBX_SUCCESS when they
 * differ, or an OS error code. */
static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) {
  int rc;
  uint64_t bait;
  lck_t *const pending_lck = pending->lck;
  lck_t *const scan_lck = scan->lck;
  if (pending_lck) {
    bait = atomic_load64(&pending_lck->bait_uniqueness, mo_AcquireRelease);
    rc = MDBX_SUCCESS;
  } else {
    bait = 0 /* hush MSVC warning */;
    rc = osal_msync(scan, 0, sizeof(lck_t), MDBX_SYNC_DATA);
    if (rc == MDBX_SUCCESS)
      rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->bait_uniqueness),
                      offsetof(lck_t, bait_uniqueness));
  }
  if (likely(rc == MDBX_SUCCESS) &&
      bait == atomic_load64(&scan_lck->bait_uniqueness, mo_AcquireRelease))
    rc = MDBX_RESULT_TRUE;

  TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d",
        pending_lck ? "mem" : "file", bait,
        (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc);
  return rc;
}
/* Writes a fresh pseudo-random bait_uniqueness stamp into `scan`'s mapping
 * and then re-peeks: if `pending` observes the new value, both handles
 * refer to the same lck-file. `*abra` is the caller's PRNG state, lazily
 * seeded from the thread id and the monotonic clock, stepped as an LCG on
 * each call. */
static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan,
                     uint64_t *abra) {
  if (*abra == 0) {
    /* lazy seeding on the first call */
    const uintptr_t tid = osal_thread_self();
    uintptr_t uit = 0;
    memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit));
    *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit);
  }
  /* mix in the pid so concurrent processes produce different baits */
  const uint64_t cadabra =
      rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid())
          << 24 |
      *abra >> 40;
  lck_t *const scan_lck = scan->lck;
  atomic_store64(&scan_lck->bait_uniqueness, cadabra, mo_AcquireRelease);
  /* LCG step for the next call */
  *abra = *abra * UINT64_C(6364136223846793005) + 1;
  return uniq_peek(pending, scan);
}
/* Checks whether the lck-file `pending` is already open within this
 * process under another registered MDBX_env. For each candidate env a
 * unique bait is poked into its mapping and re-checked through `pending`;
 * several consecutive confirmations (including one after an msync) are
 * required before reporting a match. On success *found is either nullptr
 * (the lck-file is unique) or the colliding env. */
__cold int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found) {
  *found = nullptr;
  uint64_t salt = 0;
  for (size_t i = 0; i < rthc_count; ++i) {
    MDBX_env *const scan = rthc_table[i].env;
    if (!scan->lck_mmap.lck || &scan->lck_mmap == pending)
      continue;
    int err =
        atomic_load64(&scan->lck_mmap.lck->bait_uniqueness, mo_AcquireRelease)
            ? uniq_peek(pending, &scan->lck_mmap)
            : uniq_poke(pending, &scan->lck_mmap, &salt);
    if (err == MDBX_ENODATA) {
      uint64_t length = 0;
      if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS &&
                 length == 0)) {
        /* LY: skip checking since LCK-file is empty, i.e. just created. */
        DEBUG("%s", "unique (new/empty lck)");
        return MDBX_SUCCESS;
      }
    }
    /* a match must survive repeated pokes before being believed */
    if (err == MDBX_RESULT_TRUE)
      err = uniq_poke(pending, &scan->lck_mmap, &salt);
    if (err == MDBX_RESULT_TRUE) {
      (void)osal_msync(&scan->lck_mmap, 0, sizeof(lck_t), MDBX_SYNC_KICK);
      err = uniq_poke(pending, &scan->lck_mmap, &salt);
    }
    if (err == MDBX_RESULT_TRUE) {
      err = uniq_poke(pending, &scan->lck_mmap, &salt);
      *found = scan;
      DEBUG("found %p", __Wpedantic_format_voidptr(*found));
      return MDBX_SUCCESS;
    }
    if (unlikely(err != MDBX_SUCCESS)) {
      DEBUG("failed rc %d", err);
      return err;
    }
  }
  DEBUG("%s", "unique");
  return MDBX_SUCCESS;
}
//------------------------------------------------------------------------------
#if defined(_WIN32) || defined(_WIN64)
static CRITICAL_SECTION rthc_critical_section;
#else
static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER;
static osal_thread_key_t rthc_key;
static mdbx_atomic_uint32_t rthc_pending;
static inline uint64_t rthc_signature(const void *addr, uint8_t kind) {
uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^
UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return salt << 8 | kind;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return (uint64_t)kind << 56 | salt >> 8;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
}
#define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D)
#define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0)
static __thread uint64_t rthc_thread_state
#if __has_attribute(tls_model) && \
(defined(__PIC__) || defined(__pic__) || MDBX_BUILD_SHARED_LIBRARY)
__attribute__((tls_model("local-dynamic")))
#endif
;
/* The TLS state word may be touched by Darwin's tlv_free() during thread
 * teardown; suppress ASAN instrumentation there to avoid a false-positive
 * trap (and keep the read out-of-line so the attribute is effective). */
#if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) &&                     \
    !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS)
#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS                                     \
  __attribute__((__no_sanitize_address__, __noinline__))
#else
#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS inline
#endif
/* Fetches the current 64-bit signature/state stored at `rthc`. */
MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t rthc_read(const void *rthc) {
  const volatile uint64_t *const state = (const volatile uint64_t *)rthc;
  return *state;
}
/* Atomically compares the state word at `rthc` against `signature` and, on
 * match, resets it to zero; returns non-zero on success. Without native
 * 64-bit CAS only the first 32-bit half of the word is compared-and-cleared
 * (the low bits of `signature` on little-endian, the high bits on
 * big-endian — i.e. the half addressed at offset 0). */
MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t
rthc_compare_and_clean(const void *rthc, const uint64_t signature) {
#if MDBX_64BIT_CAS
  return atomic_cas64((mdbx_atomic_uint64_t *)rthc, signature, 0);
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return atomic_cas32((mdbx_atomic_uint32_t *)rthc, (uint32_t)signature, 0);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return atomic_cas32((mdbx_atomic_uint32_t *)rthc, (uint32_t)(signature >> 32),
                      0);
#else
#error "FIXME: Unsupported byte order"
#endif
}
/* Arranges for `dtor(obj)` to be invoked when the calling thread exits.
 * Prefers the C++ ABI hook __cxa_thread_atexit (or its glibc/bionic
 * __cxa_thread_atexit_impl variant), then Darwin's _tlv_atexit; returns
 * MDBX_ENOSYS when no such facility is available, in which case the caller
 * falls back to pthread TSD (see thread_rthc_set). */
static inline int rthc_atexit(void (*dtor)(void *), void *obj,
                              void *dso_symbol) {
  /* Feature detection: may __cxa_thread_atexit_impl be present? */
#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL
#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) ||                           \
    defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) ||         \
    defined(BIONIC)
#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1
#else
#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0
#endif
#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL */
  /* Feature detection: may __cxa_thread_atexit be present? */
#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT
#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT) ||                                \
    defined(HAVE___CXA_THREAD_ATEXIT)
#define MDBX_HAVE_CXA_THREAD_ATEXIT 1
#elif !MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL &&                                     \
    (defined(__linux__) || defined(__gnu_linux__))
#define MDBX_HAVE_CXA_THREAD_ATEXIT 1
#else
#define MDBX_HAVE_CXA_THREAD_ATEXIT 0
#endif
#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT */
  int rc = MDBX_ENOSYS;
#if MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && !MDBX_HAVE_CXA_THREAD_ATEXIT
#define __cxa_thread_atexit __cxa_thread_atexit_impl
#endif
#if MDBX_HAVE_CXA_THREAD_ATEXIT || defined(__cxa_thread_atexit)
  /* Weak import: probe at runtime whether the symbol is actually linked. */
  extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj,
                                 void *dso_symbol) MDBX_WEAK_IMPORT_ATTRIBUTE;
  if (&__cxa_thread_atexit)
    rc = __cxa_thread_atexit(dtor, obj, dso_symbol);
#elif defined(__APPLE__) || defined(_DARWIN_C_SOURCE)
  /* Darwin: _tlv_atexit takes no DSO anchor. */
  extern void _tlv_atexit(void (*termfunc)(void *objAddr), void *objAddr)
      MDBX_WEAK_IMPORT_ATTRIBUTE;
  if (&_tlv_atexit) {
    (void)dso_symbol;
    _tlv_atexit(dtor, obj);
    rc = 0;
  }
#else
  (void)dtor;
  (void)obj;
  (void)dso_symbol;
#endif
  return rc;
}
__cold void workaround_glibc_bug21031(void) {
  /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031
   *
   * Due to a race between pthread_key_delete() and __nptl_deallocate_tsd(),
   * destructor(s) of thread-local-storage object(s) may still be running
   * (or blocked) in other thread(s); unloading this library's DSO in that
   * window would lead to a SEGFAULT.
   *
   * So yield a few timeslices to give such destructor(s) a chance to
   * complete, thereby avoiding the segfault. */
  for (int spin = 0; spin < 3; ++spin)
    sched_yield();
}
#endif /* !Windows */
/* Acquires the global lock guarding the RTHC registry
 * (rthc_table/rthc_count and the pending-destructor bookkeeping). */
void rthc_lock(void) {
#if defined(_WIN32) || defined(_WIN64)
  EnterCriticalSection(&rthc_critical_section);
#else
  ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0);
#endif
}
/* Releases the global RTHC registry lock taken by rthc_lock(). */
void rthc_unlock(void) {
#if defined(_WIN32) || defined(_WIN64)
  LeaveCriticalSection(&rthc_critical_section);
#else
  ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0);
#endif
}
/* Allocates a thread-local storage key (TLS slot on Windows, pthread key
 * without a destructor elsewhere). Returns MDBX_SUCCESS or an OS/errno
 * error code. */
static inline int thread_key_create(osal_thread_key_t *key) {
  int rc;
#if defined(_WIN32) || defined(_WIN64)
  *key = TlsAlloc();
  rc = (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : GetLastError();
#else
  rc = pthread_key_create(key, nullptr);
#endif
  TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key),
        (uintptr_t)*key, rc);
  return rc;
}
/* Stores `value` into the given thread-local key. On POSIX additionally
 * registers the calling thread (once) so that rthc_thread_dtor() runs at
 * thread exit: preferably via rthc_atexit() (the C++ ABI thread-exit hook),
 * otherwise falling back to a pthread TSD destructor plus the rthc_pending
 * counter that rthc_dtor() waits on at process shutdown. */
void thread_rthc_set(osal_thread_key_t key, const void *value) {
#if defined(_WIN32) || defined(_WIN64)
  ENSURE(nullptr, TlsSetValue(key, (void *)value));
#else
  const uint64_t sign_registered =
      MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state);
  const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(&rthc_thread_state);
  if (value && unlikely(rthc_thread_state != sign_registered &&
                        rthc_thread_state != sign_counted)) {
    /* First time this thread binds a value: mark the thread as registered. */
    rthc_thread_state = sign_registered;
    TRACE("thread registered 0x%" PRIxPTR, osal_thread_self());
    if (rthc_atexit(rthc_thread_dtor, &rthc_thread_state,
                    (void *)&mdbx_version /* dso_anchor */)) {
      /* No thread-atexit facility available: fall back to the pthread TSD
       * destructor and account this thread in rthc_pending. */
      ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0);
      rthc_thread_state = sign_counted;
      const unsigned count_before = atomic_add32(&rthc_pending, 1);
      ENSURE(nullptr, count_before < INT_MAX);
      NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u",
             (uintptr_t)rthc_key, count_before);
      (void)count_before;
    }
  }
  ENSURE(nullptr, pthread_setspecific(key, value) == 0);
#endif
}
/* dtor called for thread, i.e. for all mdbx's environment objects.
 * Releases the reader slot(s) that the exiting thread holds in every
 * environment belonging to the current process, then (on POSIX) consumes
 * and validates the thread's own TLS state word, decrements rthc_pending
 * for TSD-fallback threads and wakes rthc_dtor() when the count hits 0. */
__cold void rthc_thread_dtor(void *rthc) {
  rthc_lock();
  const uint32_t current_pid = osal_getpid();
#if defined(_WIN32) || defined(_WIN64)
  TRACE(">> pid %d, thread 0x%" PRIxPTR ", module %p", current_pid,
        osal_thread_self(), rthc);
#else
  TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", current_pid,
        osal_thread_self(), rthc);
#endif
  for (size_t i = 0; i < rthc_count; ++i) {
    MDBX_env *const env = rthc_table[i].env;
    if (env->pid != current_pid)
      continue;
    if (!(env->flags & ENV_TXKEY))
      continue;
    reader_slot_t *const reader = thread_rthc_get(env->me_txkey);
    reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
    reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
    /* Skip this env unless the thread actually owns one of its slots. */
    if (reader < begin || reader >= end)
      continue;
#if !defined(_WIN32) && !defined(_WIN64)
    if (pthread_setspecific(env->me_txkey, nullptr) != 0) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p: ignore race with tsd-key deletion",
            osal_thread_self(), __Wpedantic_format_voidptr(reader));
      continue /* ignore race with tsd-key deletion by mdbx_env_close() */;
    }
#endif
    TRACE("== thread 0x%" PRIxPTR
          ", rthc %p, [%zi], %p ... %p (%+i), rtch-pid %i, "
          "current-pid %i",
          osal_thread_self(), __Wpedantic_format_voidptr(reader), i,
          __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end),
          (int)(reader - begin), reader->pid.weak, current_pid);
    if (atomic_load32(&reader->pid, mo_Relaxed) == current_pid) {
      TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(),
            __Wpedantic_format_voidptr(reader));
      /* Release the slot and flag the reader table for refresh. */
      (void)atomic_cas32(&reader->pid, current_pid, 0);
      atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
    }
  }
#if defined(_WIN32) || defined(_WIN64)
  TRACE("<< thread 0x%" PRIxPTR ", module %p", osal_thread_self(), rthc);
  rthc_unlock();
#else
  /* Consume/validate this thread's own TLS state word (see rthc_signature
   * and thread_rthc_set for how it was stamped). */
  const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc);
  const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc);
  const uint64_t state = rthc_read(rthc);
  if (state == sign_registered &&
      rthc_compare_and_clean(rthc, sign_registered)) {
    TRACE("== thread 0x%" PRIxPTR
          ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
          osal_thread_self(), rthc, osal_getpid(), "registered", state);
  } else if (state == sign_counted &&
             rthc_compare_and_clean(rthc, sign_counted)) {
    TRACE("== thread 0x%" PRIxPTR
          ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
          osal_thread_self(), rthc, osal_getpid(), "counted", state);
    /* This thread was accounted via the pthread-TSD fallback. */
    ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
  } else {
    WARNING("thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), rthc, osal_getpid(), "wrong", state);
  }
  if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) {
    /* No more pending destructors: wake rthc_dtor() waiting on the cond. */
    TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(),
          rthc, osal_getpid());
    ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0);
  }
  TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc);
  /* Allow tail call optimization, i.e. gcc should generate the jmp instruction
   * instead of a call for pthread_mutex_unlock() and therefore CPU could not
   * return to current DSO's code section, which may be unloaded immediately
   * after the mutex got released. */
  pthread_mutex_unlock(&rthc_mutex);
#endif
}
/* Registers `env` in the global RTHC table (the registry lock must be held
 * by the caller) and, unless MDBX_NOSTICKYTHREADS is set, creates its
 * per-thread transaction key. Returns MDBX_PANIC on double registration,
 * MDBX_ENOMEM if the table cannot grow, or the thread_key_create() error. */
__cold int rthc_register(MDBX_env *const env) {
  TRACE(">> env %p, rthc_count %u, rthc_limit %u",
        __Wpedantic_format_voidptr(env), rthc_count, rthc_limit);
  int rc = MDBX_SUCCESS;
  for (size_t i = 0; i < rthc_count; ++i)
    if (unlikely(rthc_table[i].env == env)) {
      rc = MDBX_PANIC;
      goto bailout;
    }
  env->me_txkey = 0;
  if (unlikely(rthc_count == rthc_limit)) {
    /* Table is full: double its capacity, migrating off the static initial
     * table on the first growth (realloc of the static table would be UB,
     * hence the nullptr + memcpy dance). */
    rthc_entry_t *new_table =
        osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table,
                     sizeof(rthc_entry_t) * rthc_limit * 2);
    if (unlikely(new_table == nullptr)) {
      rc = MDBX_ENOMEM;
      goto bailout;
    }
    if (rthc_table == rthc_table_static)
      memcpy(new_table, rthc_table, sizeof(rthc_entry_t) * rthc_limit);
    rthc_table = new_table;
    rthc_limit *= 2;
  }
  if ((env->flags & MDBX_NOSTICKYTHREADS) == 0) {
    rc = thread_key_create(&env->me_txkey);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
    env->flags |= ENV_TXKEY;
  }
  rthc_table[rthc_count].env = env;
  TRACE("== [%i] = env %p, key %" PRIuPTR, rthc_count,
        __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey);
  ++rthc_count;
bailout:
  TRACE("<< env %p, key %" PRIuPTR ", rthc_count %u, rthc_limit %u, rc %d",
        __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count,
        rthc_limit, rc);
  return rc;
}
/* Releases the LCK-related resources of `env` held by the current process:
 * clears this process's reader slots, checks whether another MDBX_env in
 * this process still shares the same LCK-file (inprocess_neighbor), clears
 * the registered-reader pid when safe, then destroys/releases the lock.
 * Returns the first error encountered, preferring the earlier one. */
__cold static int rthc_drown(MDBX_env *const env) {
  const uint32_t current_pid = osal_getpid();
  int rc = MDBX_SUCCESS;
  MDBX_env *inprocess_neighbor = nullptr;
  if (likely(env->lck_mmap.lck && current_pid == env->pid)) {
    reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
    reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
    TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d",
          (current_pid == env->pid) ? "cleanup" : "skip",
          __Wpedantic_format_voidptr(env), env->pid,
          __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end),
          current_pid);
    bool cleaned = false;
    for (reader_slot_t *r = begin; r < end; ++r) {
      if (atomic_load32(&r->pid, mo_Relaxed) == current_pid) {
        atomic_store32(&r->pid, 0, mo_AcquireRelease);
        TRACE("== cleanup %p", __Wpedantic_format_voidptr(r));
        cleaned = true;
      }
    }
    if (cleaned)
      atomic_store32(&env->lck_mmap.lck->rdt_refresh_flag, true, mo_Relaxed);
    rc = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor);
    /* Clear the registered-reader pid only when no other env of this
     * process still uses the same LCK-file. */
    if (!inprocess_neighbor && env->registered_reader_pid &&
        env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
      int err = lck_rpid_clear(env);
      rc = rc ? rc : err;
    }
  }
  int err = lck_destroy(env, inprocess_neighbor, current_pid);
  env->pid = 0;
  return rc ? rc : err;
}
/* Unregisters `env` from the RTHC table (the registry lock must be held by
 * the caller): drowns its LCK resources first, then removes the table entry
 * by swapping in the last one; when the table empties, falls back to the
 * static initial table and frees the heap-allocated one. */
__cold int rthc_remove(MDBX_env *const env) {
  TRACE(">>> env %p, key %zu, rthc_count %u, rthc_limit %u",
        __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count,
        rthc_limit);
  int rc = MDBX_SUCCESS;
  if (likely(env->pid))
    rc = rthc_drown(env);
  for (size_t i = 0; i < rthc_count; ++i) {
    if (rthc_table[i].env == env) {
      /* Swap-remove: fill the hole with the last entry. */
      if (--rthc_count > 0)
        rthc_table[i] = rthc_table[rthc_count];
      else if (rthc_table != rthc_table_static) {
        /* Last entry gone: shrink back to the static table; the barrier
         * publishes the pointer switch before the old table is freed. */
        void *tmp = rthc_table;
        rthc_table = rthc_table_static;
        rthc_limit = RTHC_INITIAL_LIMIT;
        osal_memory_barrier();
        osal_free(tmp);
      }
      break;
    }
  }
  TRACE("<<< %p, key %zu, rthc_count %u, rthc_limit %u",
        __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count,
        rthc_limit);
  return rc;
}
#if !defined(_WIN32) && !defined(_WIN64)
/* pthread_atfork() child-side handler: in a forked child the inherited
 * mappings and locks belong to the parent, so unmap the LCK and DXB
 * mappings, reset each env to the lock-less stub, drown its resources,
 * and finally reset the registry to its pristine static state. */
__cold void rthc_afterfork(void) {
  NOTICE("drown %d rthc entries", rthc_count);
  for (size_t i = 0; i < rthc_count; ++i) {
    MDBX_env *const env = rthc_table[i].env;
    NOTICE("drown env %p", __Wpedantic_format_voidptr(env));
    if (env->lck_mmap.lck)
      osal_munmap(&env->lck_mmap);
    if (env->dxb_mmap.base) {
      osal_munmap(&env->dxb_mmap);
#ifdef ENABLE_MEMCHECK
      /* Drop Valgrind's tracking of the unmapped region. */
      VALGRIND_DISCARD(env->valgrind_handle);
      env->valgrind_handle = -1;
#endif /* ENABLE_MEMCHECK */
    }
    env->lck = lckless_stub(env);
    rthc_drown(env);
  }
  if (rthc_table != rthc_table_static)
    osal_free(rthc_table);
  rthc_count = 0;
  rthc_table = rthc_table_static;
  rthc_limit = RTHC_INITIAL_LIMIT;
  rthc_pending.weak = 0;
}
#endif /* ! Windows */
/* One-time process initialization of the RTHC machinery: the registry lock,
 * the after-fork handler and the TLS key with its thread destructor. */
__cold void rthc_ctor(void) {
#if defined(_WIN32) || defined(_WIN64)
  InitializeCriticalSection(&rthc_critical_section);
#else
  ENSURE(nullptr, pthread_atfork(nullptr, nullptr, rthc_afterfork) == 0);
  ENSURE(nullptr, pthread_key_create(&rthc_key, rthc_thread_dtor) == 0);
  TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(),
        __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key);
#endif
}
/* Process-shutdown counterpart of rthc_ctor(): consumes the calling
 * thread's own TLS state word, waits (bounded) for destructors still
 * pending in other threads, releases the remaining reader slots of this
 * process, deletes the TLS keys and frees the registry table. */
__cold void rthc_dtor(const uint32_t current_pid) {
  rthc_lock();
#if !defined(_WIN32) && !defined(_WIN64)
  uint64_t *rthc = pthread_getspecific(rthc_key);
  TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64
        ", left %d",
        osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
        rthc ? rthc_read(rthc) : ~UINT64_C(0),
        atomic_load32(&rthc_pending, mo_Relaxed));
  if (rthc) {
    /* Consume/validate the calling thread's own state word first, so it
     * is not counted among the pending destructors below. */
    const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc);
    const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc);
    const uint64_t state = rthc_read(rthc);
    if (state == sign_registered &&
        rthc_compare_and_clean(rthc, sign_registered)) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
            "registered", state);
    } else if (state == sign_counted &&
               rthc_compare_and_clean(rthc, sign_counted)) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
            "counted", state);
      ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
    } else {
      WARNING("thread 0x%" PRIxPTR
              ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
              osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid,
              "wrong", state);
    }
  }
  /* Wait up to ~100ms (plus 10 minutes in debug builds) for destructors
   * still running in other threads; rthc_thread_dtor() broadcasts on
   * rthc_cond when the pending counter drops to zero. */
  struct timespec abstime;
  ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0);
  abstime.tv_nsec += 1000000000l / 10;
  if (abstime.tv_nsec >= 1000000000l) {
    abstime.tv_nsec -= 1000000000l;
    abstime.tv_sec += 1;
  }
#if MDBX_DEBUG > 0
  abstime.tv_sec += 600;
#endif
  for (unsigned left;
       (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) {
    NOTICE("tls-cleanup: pid %d, pending %u, wait for...", current_pid, left);
    const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime);
    if (rc && rc != EINTR)
      break;
  }
  thread_key_delete(rthc_key);
#endif
  for (size_t i = 0; i < rthc_count; ++i) {
    MDBX_env *const env = rthc_table[i].env;
    if (env->pid != current_pid)
      continue;
    if (!(env->flags & ENV_TXKEY))
      continue;
    reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0];
    reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers];
    thread_key_delete(env->me_txkey);
    bool cleaned = false;
    for (reader_slot_t *reader = begin; reader < end; ++reader) {
      TRACE("== [%zi] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), "
            "rthc-pid %i, current-pid %i",
            i, (uintptr_t)env->me_txkey, __Wpedantic_format_voidptr(begin),
            __Wpedantic_format_voidptr(end), __Wpedantic_format_voidptr(reader),
            (int)(reader - begin), reader->pid.weak, current_pid);
      if (atomic_load32(&reader->pid, mo_Relaxed) == current_pid) {
        /* Release this process's reader slot. */
        (void)atomic_cas32(&reader->pid, current_pid, 0);
        TRACE("== cleanup %p", __Wpedantic_format_voidptr(reader));
        cleaned = true;
      }
    }
    if (cleaned)
      atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
  }
  rthc_limit = rthc_count = 0;
  if (rthc_table != rthc_table_static)
    osal_free(rthc_table);
  rthc_table = nullptr;
  rthc_unlock();
#if defined(_WIN32) || defined(_WIN64)
  DeleteCriticalSection(&rthc_critical_section);
#else
  /* LY: yielding a few timeslices to give a more chance
   * to racing destructor(s) for completion. */
  workaround_glibc_bug21031();
#endif
}

43
src/tls.h Normal file
View File

@ -0,0 +1,43 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
MDBX_INTERNAL void rthc_ctor(void);
MDBX_INTERNAL void rthc_dtor(const uint32_t current_pid);
MDBX_INTERNAL void rthc_lock(void);
MDBX_INTERNAL void rthc_unlock(void);
MDBX_INTERNAL int rthc_register(MDBX_env *const env);
MDBX_INTERNAL int rthc_remove(MDBX_env *const env);
MDBX_INTERNAL int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found);
/* dtor called for thread, i.e. for all mdbx's environment objects */
MDBX_INTERNAL void rthc_thread_dtor(void *rthc);
/* Returns the calling thread's value for the given thread-local key
 * (TLS slot on Windows, pthread TSD elsewhere). */
static inline void *thread_rthc_get(osal_thread_key_t key) {
#if defined(_WIN32) || defined(_WIN64)
  return TlsGetValue(key);
#else
  return pthread_getspecific(key);
#endif
}
MDBX_INTERNAL void thread_rthc_set(osal_thread_key_t key, const void *value);
#if !defined(_WIN32) && !defined(_WIN64)
MDBX_INTERNAL void rthc_afterfork(void);
MDBX_INTERNAL void workaround_glibc_bug21031(void);
#endif /* !Windows */
/* Releases a thread-local key; on POSIX also yields a few timeslices to
 * let racing TLS destructors finish (see workaround_glibc_bug21031). */
static inline void thread_key_delete(osal_thread_key_t key) {
  TRACE("key = %" PRIuPTR, (uintptr_t)key);
#if defined(_WIN32) || defined(_WIN64)
  ENSURE(nullptr, TlsFree(key));
#else
  ENSURE(nullptr, pthread_key_delete(key) == 0);
  workaround_glibc_bug21031();
#endif
}

View File

@ -1,17 +1,8 @@
/* mdbx_chk.c - memory-mapped database check tool */
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_chk.c - memory-mapped database check tool
///
#ifdef _MSC_VER
#if _MSC_VER > 1800
@ -21,7 +12,7 @@
#endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"
#include <ctype.h>
@ -59,8 +50,7 @@ static void signal_handler(int sig) {
#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1)
#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE
enum MDBX_env_flags_t env_flags =
MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION;
MDBX_env_flags_t env_flags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION;
MDBX_env *env;
MDBX_txn *txn;
unsigned verbose = 0;
@ -70,8 +60,8 @@ int stuck_meta = -1;
MDBX_chk_context_t chk;
bool turn_meta = false;
bool force_turn_meta = false;
enum MDBX_chk_flags_t chk_flags = MDBX_CHK_DEFAULTS;
enum MDBX_chk_stage chk_stage = MDBX_chk_none;
MDBX_chk_flags_t chk_flags = MDBX_CHK_DEFAULTS;
MDBX_chk_stage_t chk_stage = MDBX_chk_none;
static MDBX_chk_line_t line_struct;
static size_t anchor_lineno;
@ -105,7 +95,7 @@ static bool silently(enum MDBX_chk_severity severity) {
chk.scope ? chk.scope->verbosity >> MDBX_chk_severity_prio_shift
: verbose + (MDBX_chk_result >> MDBX_chk_severity_prio_shift);
int prio = (severity >> MDBX_chk_severity_prio_shift);
if (chk.scope && chk.scope->stage == MDBX_chk_traversal_subdbs && verbose < 2)
if (chk.scope && chk.scope->stage == MDBX_chk_subdbs && verbose < 2)
prio += 1;
return quiet || cutoff < ((prio > 0) ? prio : 0);
}
@ -398,7 +388,7 @@ static int conclude(MDBX_chk_context_t *ctx) {
" at txn-id #%" PRIi64 "...",
ctx->result.recent_txnid);
flush();
err = error_fn("mdbx_env_pgwalk", mdbx_env_sync_ex(ctx->env, true, false));
err = error_fn("walk_pages", mdbx_env_sync_ex(ctx->env, true, false));
if (err == MDBX_SUCCESS) {
ctx->result.problems_meta -= 1;
ctx->result.total_problems -= 1;

View File

@ -1,17 +1,10 @@
/* mdbx_copy.c - memory-mapped database backup tool */
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_copy.c - memory-mapped database backup tool
///
#ifdef _MSC_VER
#if _MSC_VER > 1800
@ -21,7 +14,7 @@
#endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"
#if defined(_WIN32) || defined(_WIN64)
#include "wingetopt.h"
@ -60,7 +53,7 @@ static void usage(const char *prog) {
int main(int argc, char *argv[]) {
int rc;
MDBX_env *env = NULL;
MDBX_env *env = nullptr;
const char *progname = argv[0], *act;
unsigned flags = MDBX_RDONLY;
unsigned cpflags = 0;
@ -123,7 +116,7 @@ int main(int argc, char *argv[]) {
"mdbx_copy %s (%s, T-%s)\nRunning for copy %s to %s...\n",
mdbx_version.git.describe, mdbx_version.git.datetime,
mdbx_version.git.tree, argv[1], (argc == 2) ? "stdout" : argv[2]);
fflush(NULL);
fflush(nullptr);
}
act = "opening environment";

View File

@ -1,19 +1,10 @@
/* mdbx_drop.c - memory-mapped database delete tool */
/*
* Copyright 2021-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
*
* Copyright 2016-2021 Howard Chu, Symas Corp.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2021-2024
///
/// mdbx_drop.c - memory-mapped database delete tool
///
#ifdef _MSC_VER
#if _MSC_VER > 1800
@ -23,7 +14,7 @@
#endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"
#include <ctype.h>
@ -162,7 +153,7 @@ int main(int argc, char *argv[]) {
goto env_close;
}
rc = mdbx_txn_begin(env, NULL, 0, &txn);
rc = mdbx_txn_begin(env, nullptr, 0, &txn);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_txn_begin", rc);
goto env_close;

View File

@ -1,17 +1,10 @@
/* mdbx_dump.c - memory-mapped database dump tool */
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_dump.c - memory-mapped database dump tool
///
#ifdef _MSC_VER
#if _MSC_VER > 1800
@ -21,7 +14,7 @@
#endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"
#include <ctype.h>
@ -37,7 +30,7 @@ typedef struct flagbit {
flagbit dbflags[] = {{MDBX_REVERSEKEY, "reversekey"},
{MDBX_DUPSORT, "dupsort"},
{MDBX_INTEGERKEY, "integerkey"},
{MDBX_DUPFIXED, "dupfixed"},
{MDBX_DUPFIXED, "dupfix"},
{MDBX_INTEGERDUP, "integerdup"},
{MDBX_REVERSEDUP, "reversedup"},
{0, nullptr}};
@ -108,7 +101,7 @@ static void error(const char *func, int rc) {
/* Dump in BDB-compatible format */
static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
unsigned int flags;
unsigned flags;
int rc = mdbx_dbi_flags(txn, dbi, &flags);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_dbi_flags", rc);
@ -187,9 +180,11 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
return rc;
}
if (rescue) {
cursor->mc_checking |= CC_SKIPORD;
if (cursor->mc_xcursor)
cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD;
rc = mdbx_cursor_ignord(cursor);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_cursor_ignord", rc);
return rc;
}
}
while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) ==
@ -245,7 +240,7 @@ static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) {
}
int main(int argc, char *argv[]) {
int i, rc;
int i, err;
MDBX_env *env;
MDBX_txn *txn;
MDBX_dbi dbi;
@ -355,47 +350,47 @@ int main(int argc, char *argv[]) {
fflush(nullptr);
}
rc = mdbx_env_create(&env);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_env_create", rc);
err = mdbx_env_create(&env);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_env_create", err);
return EXIT_FAILURE;
}
if (alldbs || subname) {
rc = mdbx_env_set_maxdbs(env, 2);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_env_set_maxdbs", rc);
err = mdbx_env_set_maxdbs(env, 2);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_env_set_maxdbs", err);
goto env_close;
}
}
rc = mdbx_env_open(
err = mdbx_env_open(
env, envname,
envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION
: MDBX_RDONLY),
0);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_env_open", rc);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_env_open", err);
goto env_close;
}
if (warmup) {
rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
if (MDBX_IS_ERROR(rc)) {
error("mdbx_env_warmup", rc);
err = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
if (MDBX_IS_ERROR(err)) {
error("mdbx_env_warmup", err);
goto env_close;
}
}
rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_txn_begin", rc);
err = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_txn_begin", err);
goto env_close;
}
rc = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &dbi);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_dbi_open", rc);
err = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &dbi);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_dbi_open", err);
goto txn_abort;
}
@ -403,24 +398,26 @@ int main(int argc, char *argv[]) {
assert(dbi == MAIN_DBI);
MDBX_cursor *cursor;
rc = mdbx_cursor_open(txn, MAIN_DBI, &cursor);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_cursor_open", rc);
err = mdbx_cursor_open(txn, MAIN_DBI, &cursor);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_cursor_open", err);
goto txn_abort;
}
if (rescue) {
cursor->mc_checking |= CC_SKIPORD;
if (cursor->mc_xcursor)
cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD;
err = mdbx_cursor_ignord(cursor);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_cursor_ignord", err);
return err;
}
}
bool have_raw = false;
int count = 0;
MDBX_val key;
while (MDBX_SUCCESS ==
(rc = mdbx_cursor_get(cursor, &key, nullptr, MDBX_NEXT_NODUP))) {
(err = mdbx_cursor_get(cursor, &key, nullptr, MDBX_NEXT_NODUP))) {
if (user_break) {
rc = MDBX_EINTR;
err = MDBX_EINTR;
break;
}
@ -428,7 +425,7 @@ int main(int argc, char *argv[]) {
continue;
subname = osal_realloc(buf4free, key.iov_len + 1);
if (!subname) {
rc = MDBX_ENOMEM;
err = MDBX_ENOMEM;
break;
}
@ -437,15 +434,15 @@ int main(int argc, char *argv[]) {
subname[key.iov_len] = '\0';
MDBX_dbi sub_dbi;
rc = mdbx_dbi_open_ex(txn, subname, MDBX_DB_ACCEDE, &sub_dbi,
rescue ? equal_or_greater : nullptr,
rescue ? equal_or_greater : nullptr);
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_INCOMPATIBLE) {
err = mdbx_dbi_open_ex(txn, subname, MDBX_DB_ACCEDE, &sub_dbi,
rescue ? equal_or_greater : nullptr,
rescue ? equal_or_greater : nullptr);
if (unlikely(err != MDBX_SUCCESS)) {
if (err == MDBX_INCOMPATIBLE) {
have_raw = true;
continue;
}
error("mdbx_dbi_open", rc);
error("mdbx_dbi_open", err);
if (!rescue)
break;
} else {
@ -453,13 +450,13 @@ int main(int argc, char *argv[]) {
if (list) {
printf("%s\n", subname);
} else {
rc = dump_sdb(txn, sub_dbi, subname);
if (unlikely(rc != MDBX_SUCCESS)) {
err = dump_sdb(txn, sub_dbi, subname);
if (unlikely(err != MDBX_SUCCESS)) {
if (!rescue)
break;
if (!quiet)
fprintf(stderr, "%s: %s: ignore %s for `%s` and continue\n", prog,
envname, mdbx_strerror(rc), subname);
envname, mdbx_strerror(err), subname);
/* Here is a hack for rescue mode, don't do that:
* - we should restart transaction in case error due
* database corruption;
@ -468,21 +465,21 @@ int main(int argc, char *argv[]) {
* - this is possible since DB is opened in read-only exclusive
* mode and transaction is the same, i.e. has the same address
* and so on. */
rc = mdbx_txn_reset(txn);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_txn_reset", rc);
err = mdbx_txn_reset(txn);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_txn_reset", err);
goto env_close;
}
rc = mdbx_txn_renew(txn);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_txn_renew", rc);
err = mdbx_txn_renew(txn);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_txn_renew", err);
goto env_close;
}
}
}
rc = mdbx_dbi_close(env, sub_dbi);
if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_dbi_close", rc);
err = mdbx_dbi_close(env, sub_dbi);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_dbi_close", err);
break;
}
}
@ -491,20 +488,20 @@ int main(int argc, char *argv[]) {
cursor = nullptr;
if (have_raw && (!count /* || rescue */))
rc = dump_sdb(txn, MAIN_DBI, nullptr);
err = dump_sdb(txn, MAIN_DBI, nullptr);
else if (!count) {
if (!quiet)
fprintf(stderr, "%s: %s does not contain multiple databases\n", prog,
envname);
rc = MDBX_NOTFOUND;
err = MDBX_NOTFOUND;
}
} else {
rc = dump_sdb(txn, dbi, subname);
err = dump_sdb(txn, dbi, subname);
}
switch (rc) {
switch (err) {
case MDBX_NOTFOUND:
rc = MDBX_SUCCESS;
err = MDBX_SUCCESS;
case MDBX_SUCCESS:
break;
case MDBX_EINTR:
@ -512,8 +509,8 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "Interrupted by signal/user\n");
break;
default:
if (unlikely(rc != MDBX_SUCCESS))
error("mdbx_cursor_get", rc);
if (unlikely(err != MDBX_SUCCESS))
error("mdbx_cursor_get", err);
}
mdbx_dbi_close(env, dbi);
@ -523,5 +520,5 @@ env_close:
mdbx_env_close(env);
free(buf4free);
return rc ? EXIT_FAILURE : EXIT_SUCCESS;
return err ? EXIT_FAILURE : EXIT_SUCCESS;
}

View File

@ -1,17 +1,10 @@
/* mdbx_load.c - memory-mapped database load tool */
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_load.c - memory-mapped database load tool
///
#ifdef _MSC_VER
#if _MSC_VER > 1800
@ -21,7 +14,7 @@
#endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"
#include <ctype.h>
@ -139,7 +132,7 @@ typedef struct flagbit {
flagbit dbflags[] = {
{MDBX_REVERSEKEY, S("reversekey")}, {MDBX_DUPSORT, S("duplicates")},
{MDBX_DUPSORT, S("dupsort")}, {MDBX_INTEGERKEY, S("integerkey")},
{MDBX_DUPFIXED, S("dupfixed")}, {MDBX_INTEGERDUP, S("integerdup")},
{MDBX_DUPFIXED, S("dupfix")}, {MDBX_INTEGERDUP, S("integerdup")},
{MDBX_REVERSEDUP, S("reversedup")}, {0, 0, nullptr}};
static int readhdr(void) {
@ -375,7 +368,7 @@ static int badend(void) {
return errno ? errno : MDBX_ENODATA;
}
static __inline int unhex(unsigned char *c2) {
static inline int unhex(unsigned char *c2) {
int x, c;
x = *c2++ & 0x4f;
if (x & 0x40)

View File

@ -1,17 +1,10 @@
/* mdbx_stat.c - memory-mapped database status tool */
/*
* Copyright 2015-2024 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// mdbx_stat.c - memory-mapped database status tool
///
#ifdef _MSC_VER
#if _MSC_VER > 1800
@ -21,7 +14,7 @@
#endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */
#include "internals.h"
#include "essentials.h"
#if defined(_WIN32) || defined(_WIN64)
#include "wingetopt.h"

1645
src/tree.c Normal file

File diff suppressed because it is too large Load Diff

102
src/txl.c Normal file
View File

@ -0,0 +1,102 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Converts a TXL capacity (number of txnid slots) into the byte count to
 * request from the allocator: two bookkeeping slots are added, the total is
 * rounded up to the allocation granularity, and the assumed malloc overhead
 * is subtracted back out so the chunk the allocator hands over stays
 * nicely granulated. */
static inline size_t txl_size2bytes(const size_t size) {
  assert(size > 0 && size <= txl_max * 2);
  const size_t raw = MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2);
  const size_t granularity = txl_granulate * sizeof(txnid_t);
  return ceil_powerof2(raw, granularity) - MDBX_ASSUME_MALLOC_OVERHEAD;
}
/* Inverse of txl_size2bytes(): given an allocation size in bytes, returns
 * how many txnid entries fit after the two bookkeeping slots. */
static inline size_t txl_bytes2size(const size_t bytes) {
  const size_t slots = bytes / sizeof(txnid_t);
  assert(slots > 2 && slots <= txl_max * 2);
  return slots - 2;
}
/* Allocates a fresh TXL sized for txl_initial entries.
 * Memory layout relative to the returned pointer: slot [-1] stores the
 * allocated capacity, slot [0] stores the current length (starts at 0).
 * Returns nullptr on allocation failure. */
MDBX_INTERNAL txl_t txl_alloc(void) {
  size_t bytes = txl_size2bytes(txl_initial);
  txl_t txl = osal_malloc(bytes);
  if (likely(txl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    /* take advantage of the full usable chunk the allocator actually gave */
    bytes = malloc_usable_size(txl);
#endif /* malloc_usable_size */
    txl[0] = txl_bytes2size(bytes);
    assert(txl[0] >= txl_initial);
    txl += 1;
    *txl = 0;
  }
  return txl;
}
/* Releases a list produced by txl_alloc(); nullptr is a no-op.
 * The real allocation begins one slot before the user-visible pointer. */
MDBX_INTERNAL void txl_free(txl_t txl) {
  if (unlikely(!txl))
    return;
  osal_free(txl - 1);
}
/* Grows the list so it can hold at least `wanna` entries.
 * The target capacity over-shoots the request by the amount that was
 * missing (a mild exponential growth), clamped to txl_max; requests beyond
 * txl_max fail with MDBX_TXN_FULL.  On success *ptxl may point to a
 * relocated buffer. */
MDBX_INTERNAL int txl_reserve(txl_t __restrict *__restrict ptxl,
                              const size_t wanna) {
  const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptxl);
  assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max &&
         MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl));
  if (likely(allocated >= wanna))
    return MDBX_SUCCESS;
  if (unlikely(wanna > /* paranoia */ txl_max)) {
    ERROR("TXL too long (%zu > %zu)", wanna, (size_t)txl_max);
    return MDBX_TXN_FULL;
  }
  /* wanna + (wanna - allocated): grow past the request by the deficit */
  const size_t size = (wanna + wanna - allocated < txl_max)
                          ? wanna + wanna - allocated
                          : txl_max;
  size_t bytes = txl_size2bytes(size);
  txl_t txl = osal_realloc(*ptxl - 1, bytes);
  if (likely(txl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    /* record the capacity the allocator actually provided */
    bytes = malloc_usable_size(txl);
#endif /* malloc_usable_size */
    *txl = txl_bytes2size(bytes);
    assert(*txl >= wanna);
    *ptxl = txl + 1;
    return MDBX_SUCCESS;
  }
  return MDBX_ENOMEM;
}
/* Ensures the list can accept `num` more entries, growing it when the
 * current capacity is insufficient. */
static __always_inline int __must_check_result
txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
  assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max &&
         MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl));
  assert(num <= PAGELIST_LIMIT);
  const size_t required = (size_t)MDBX_PNL_GETSIZE(*ptxl) + num;
  if (likely(MDBX_PNL_ALLOCLEN(*ptxl) >= required))
    return MDBX_SUCCESS;
  return txl_reserve(ptxl, required);
}
/* Unchecked append: the caller must have guaranteed spare capacity
 * beforehand (see txl_need / txl_append). */
static __always_inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
  assert(MDBX_PNL_GETSIZE(txl) < MDBX_PNL_ALLOCLEN(txl));
  txl[0] += 1;
  MDBX_PNL_LAST(txl) = id;
}
/* Comparison predicate fed to SORT_IMPL; the resulting order direction
 * follows SORT_IMPL's contract — NOTE(review): confirm against sort.h. */
#define TXNID_SORT_CMP(first, last) ((first) > (last))
SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP)
/* Sorts the whole list in-place using the generated txnid_sort(). */
MDBX_INTERNAL void txl_sort(txl_t txl) {
  txnid_sort(MDBX_PNL_BEGIN(txl), MDBX_PNL_END(txl));
}
/* Appends one txnid, growing the list by txl_granulate when it is full.
 * Returns MDBX_SUCCESS, or the error code from the failed grow
 * (MDBX_ENOMEM / MDBX_TXN_FULL). */
MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl,
                                                 txnid_t id) {
  if (unlikely(MDBX_PNL_GETSIZE(*ptxl) == MDBX_PNL_ALLOCLEN(*ptxl))) {
    int rc = txl_need(ptxl, txl_granulate);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  txl_xappend(*ptxl, id);
  return MDBX_SUCCESS;
}

26
src/txl.h Normal file
View File

@ -0,0 +1,26 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
/* List of txnid: a growable vector of transaction ids.
 * Relative to the user-visible pointer, slot [-1] keeps the allocated
 * capacity and slot [0] the current length (see txl.c for details). */
typedef txnid_t *txl_t;
typedef const txnid_t *const_txl_t;
enum txl_rules {
  /* allocation granularity, in txnid_t slots */
  txl_granulate = 32,
  /* initial capacity: one granule minus bookkeeping and malloc overhead */
  txl_initial =
      txl_granulate - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t),
  /* hard capacity limit (about 2^26 entries) */
  txl_max = (1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)
};
/* Allocates an empty list; returns nullptr on out-of-memory. */
MDBX_INTERNAL txl_t txl_alloc(void);
/* Releases a list; nullptr is allowed. */
MDBX_INTERNAL void txl_free(txl_t txl);
/* Appends an id, growing the list as needed; may relocate *ptxl. */
MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl,
                                                 txnid_t id);
/* Sorts the list in-place. */
MDBX_INTERNAL void txl_sort(txl_t txl);

1947
src/txn.c Normal file

File diff suppressed because it is too large Load Diff

242
src/unaligned.h Normal file
View File

@ -0,0 +1,242 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
/*------------------------------------------------------------------------------
* Unaligned access */
/* Returns the alignment guaranteed for a field: the lowest set bit of
 * (baseline | offset), i.e. the largest power of two dividing both the
 * base alignment and the field offset.
 * NOTE(review): the `-(int)merge` cast assumes the relevant low bit fits
 * in 31 bits — fine for alignments, but worth confirming. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
field_alignment(size_t alignment_baseline, size_t field_offset) {
  size_t merge = alignment_baseline | (size_t)field_offset;
  return merge & -(int)merge;
}
/* read-thunk for UB-sanitizer: funnels single-byte loads through one
 * function so sanitizers pinpoint a bad access precisely */
MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
peek_u8(const uint8_t *__restrict ptr) {
  return *ptr;
}
/* write-thunk for UB-sanitizer: counterpart of peek_u8() for stores */
static inline void poke_u8(uint8_t *__restrict ptr, const uint8_t v) {
  *ptr = v;
}
/* Copies exactly two bytes without assuming any alignment; byte-wise
 * fallback used by the unaligned 16-bit accessors. Returns dst. */
static inline void *bcopy_2(void *__restrict dst, const void *__restrict src) {
  uint8_t *__restrict out = (uint8_t *)dst;
  const uint8_t *__restrict in = (const uint8_t *)src;
  for (unsigned i = 0; i < 2; ++i)
    out[i] = in[i];
  return out;
}
/* Copies exactly four bytes without assuming any alignment; byte-wise
 * fallback used by the unaligned 32-bit accessors. Returns dst. */
static inline void *bcopy_4(void *const __restrict dst,
                            const void *const __restrict src) {
  uint8_t *__restrict out = (uint8_t *)dst;
  const uint8_t *__restrict in = (const uint8_t *)src;
  for (unsigned i = 0; i < 4; ++i)
    out[i] = in[i];
  return out;
}
/* Copies exactly eight bytes without assuming any alignment; byte-wise
 * fallback used by the unaligned 64-bit accessors. Returns dst. */
static inline void *bcopy_8(void *const __restrict dst,
                            const void *const __restrict src) {
  uint8_t *__restrict out = (uint8_t *)dst;
  const uint8_t *__restrict in = (const uint8_t *)src;
  for (unsigned i = 0; i < 8; ++i)
    out[i] = in[i];
  return out;
}
/* Reads an uint16 through a pointer that is only guaranteed to be aligned
 * to `expected_alignment` bytes: plain load when the platform or alignment
 * permits, MSVC's __unaligned qualifier where available, otherwise a
 * byte-wise copy thunk. */
MDBX_NOTHROW_PURE_FUNCTION static inline uint16_t
unaligned_peek_u16(const size_t expected_alignment, const void *const ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0)
    return *(const uint16_t *)ptr;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint16_t *)ptr;
#else
    uint16_t v;
    bcopy_2((uint8_t *)&v, (const uint8_t *)ptr);
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}
/* Writes an uint16 through a possibly misaligned pointer; mirror of
 * unaligned_peek_u16() with the same tiered strategy. */
static inline void unaligned_poke_u16(const size_t expected_alignment,
                                      void *const __restrict ptr,
                                      const uint16_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0)
    *(uint16_t *)ptr = v;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \
    defined(_M_X64) || defined(_M_IA64)
    *((uint16_t __unaligned *)ptr) = v;
#else
    bcopy_2((uint8_t *)ptr, (const uint8_t *)&v);
#endif /* _MSC_VER || __unaligned */
  }
}
/* Reads an uint32 through a possibly misaligned pointer.  Tiers:
 * plain load; two aligned 16-bit halves assembled in native byte order;
 * __unaligned qualifier; byte-wise copy as the last resort. */
MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t
unaligned_peek_u32(const size_t expected_alignment,
                   const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0)
    return *(const uint32_t *)ptr;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    /* index 0/1 selected by host endianness: low half first on LE */
    const uint16_t lo =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint16_t hi =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint32_t)hi << 16;
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint32_t *)ptr;
#else
    uint32_t v;
    bcopy_4((uint8_t *)&v, (const uint8_t *)ptr);
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}
/* Writes an uint32 through a possibly misaligned pointer; mirror of
 * unaligned_peek_u32() with the same tiered strategy. */
static inline void unaligned_poke_u32(const size_t expected_alignment,
                                      void *const __restrict ptr,
                                      const uint32_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0)
    *(uint32_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    /* store as two aligned 16-bit halves in native byte order */
    ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v;
    ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint16_t)(v >> 16);
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \
    defined(_M_X64) || defined(_M_IA64)
    *((uint32_t __unaligned *)ptr) = v;
#else
    bcopy_4((uint8_t *)ptr, (const uint8_t *)&v);
#endif /* _MSC_VER || __unaligned */
  }
}
/* Reads an uint64 through a possibly misaligned pointer.  Tiers:
 * plain load; two aligned 32-bit halves assembled in native byte order;
 * __unaligned qualifier; byte-wise copy as the last resort. */
MDBX_NOTHROW_PURE_FUNCTION static inline uint64_t
unaligned_peek_u64(const size_t expected_alignment,
                   const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0)
    return *(const uint64_t *)ptr;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    /* index 0/1 selected by host endianness: low half first on LE */
    const uint32_t lo =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint32_t hi =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint64_t)hi << 32;
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint64_t *)ptr;
#else
    uint64_t v;
    bcopy_8((uint8_t *)&v, (const uint8_t *)ptr);
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}
/* Reads an uint64 from a volatile location; alignment must be at least
 * 32-bit.  When the load is split into two 32-bit halves the read is NOT
 * atomic — callers must tolerate (or externally exclude) torn reads. */
static inline uint64_t
unaligned_peek_u64_volatile(const size_t expected_alignment,
                            const volatile void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  assert(expected_alignment % sizeof(uint32_t) == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0)
    return *(const volatile uint64_t *)ptr;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \
    defined(_M_X64) || defined(_M_IA64)
    return *(const volatile __unaligned uint64_t *)ptr;
#else
    const uint32_t lo = ((const volatile uint32_t *)
                             ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint32_t hi = ((const volatile uint32_t *)
                             ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint64_t)hi << 32;
#endif /* _MSC_VER || __unaligned */
  }
}
/* Writes an uint64 through a possibly misaligned pointer; mirror of
 * unaligned_peek_u64() with the same tiered strategy. */
static inline void unaligned_poke_u64(const size_t expected_alignment,
                                      void *const __restrict ptr,
                                      const uint64_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0)
    *(uint64_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    /* store as two aligned 32-bit halves in native byte order */
    ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v;
    ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint32_t)(v >> 32);
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \
    defined(_M_X64) || defined(_M_IA64)
    *((uint64_t __unaligned *)ptr) = v;
#else
    bcopy_8((uint8_t *)ptr, (const uint8_t *)&v);
#endif /* _MSC_VER || __unaligned */
  }
}
/* Struct-field accessors: read/write a field through an unaligned base
 * pointer by computing the field address via offsetof(), so no misaligned
 * lvalue is ever formed.  The `1` passed as expected_alignment means
 * "assume nothing" — the helpers pick the safest access path. */
#define UNALIGNED_PEEK_8(ptr, struct, field)                                   \
  peek_u8(ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_8(ptr, struct, field, value)                            \
  poke_u8(ptr_disp(ptr, offsetof(struct, field)), value)
#define UNALIGNED_PEEK_16(ptr, struct, field)                                  \
  unaligned_peek_u16(1, ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_16(ptr, struct, field, value)                           \
  unaligned_poke_u16(1, ptr_disp(ptr, offsetof(struct, field)), value)
#define UNALIGNED_PEEK_32(ptr, struct, field)                                  \
  unaligned_peek_u32(1, ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_32(ptr, struct, field, value)                           \
  unaligned_poke_u32(1, ptr_disp(ptr, offsetof(struct, field)), value)
#define UNALIGNED_PEEK_64(ptr, struct, field)                                  \
  unaligned_peek_u64(1, ptr_disp(ptr, offsetof(struct, field)))
#define UNALIGNED_POKE_64(ptr, struct, field, value)                           \
  unaligned_poke_u64(1, ptr_disp(ptr, offsetof(struct, field)), value)
/* Reads a pgno_t from an unaligned location.  The sizeof-dispatch is
 * resolved at compile time; the memcpy branch is an unreachable safety net
 * for an exotic pgno_t width. */
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t
peek_pgno(const void *const __restrict ptr) {
  if (sizeof(pgno_t) == sizeof(uint32_t))
    return (pgno_t)unaligned_peek_u32(1, ptr);
  else if (sizeof(pgno_t) == sizeof(uint64_t))
    return (pgno_t)unaligned_peek_u64(1, ptr);
  else {
    pgno_t pgno;
    memcpy(&pgno, ptr, sizeof(pgno));
    return pgno;
  }
}
/* Writes a pgno_t to an unaligned location; mirror of peek_pgno(). */
static inline void poke_pgno(void *const __restrict ptr, const pgno_t pgno) {
  if (sizeof(pgno) == sizeof(uint32_t))
    unaligned_poke_u32(1, ptr, pgno);
  else if (sizeof(pgno) == sizeof(uint64_t))
    unaligned_poke_u64(1, ptr, pgno);
  else
    memcpy(ptr, &pgno, sizeof(pgno));
}

35
src/utils.c Normal file
View File

@ -0,0 +1,35 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Returns log2 of a power-of-two value, i.e. the index of its single set
 * bit (count-trailing-zeros).  Dispatches to the best available facility:
 * GCC/Clang builtin, MSVC intrinsic, or a de Bruijn multiplication. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned
log2n_powerof2(size_t value_uintptr) {
  assert(value_uintptr > 0 && value_uintptr < INT32_MAX &&
         is_powerof2(value_uintptr));
  assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);
  const uint32_t value_uint32 = (uint32_t)value_uintptr;
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz)
  STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned));
  return __builtin_ctz(value_uint32);
#elif defined(_MSC_VER)
  unsigned long index;
  STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long));
  _BitScanForward(&index, value_uint32);
  return index;
#else
  /* branchless fallback: multiplying a power of two by the de Bruijn
   * constant 0x077CB531 and shifting by 27 yields a unique 5-bit index,
   * which the table maps back to the bit number */
  static const uint8_t debruijn_ctz32[32] = {
      0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
  return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27];
#endif
}
/* Bijective 64-bit mixer: two rounds of double-xor-rotate plus multiply,
 * finished with a xor-shift; good avalanche for hashing/scrambling. */
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v) {
  /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */
  v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50);
  v *= UINT64_C(0xA24BAED4963EE407);
  v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49);
  v *= UINT64_C(0x9FB21C651E98DF25);
  return v ^ v >> 28;
}

87
src/utils.h Normal file
View File

@ -0,0 +1,87 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
/* Test if the flags f are set in a flag word w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))
/* Round n up to an even number. */
#define EVEN_CEIL(n) (((n) + 1UL) & -2L) /* sign-extending -2 to match n+1U */
/* Round n down to an even number. */
#define EVEN_FLOOR(n) ((n) & ~(size_t)1)
/*
 *                /
 *                | -1, a < b
 * CMP2INT(a,b) = <  0, a == b
 *                |  1, a > b
 *                \
 */
#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0)
/* Pointer displacement without casting to char* to avoid pointer-aliasing */
#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp))))
/* Pointer distance as signed number of bytes */
#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less)))
/* ASAN wrappers: identical to the raw ASAN_* macros, but additionally
 * TRACE() each (un)poisoning with its call site for debugging. */
#define MDBX_ASAN_POISON_MEMORY_REGION(addr, size)                             \
  do {                                                                         \
    TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),               \
          (size_t)(size), __LINE__);                                           \
    ASAN_POISON_MEMORY_REGION(addr, size);                                     \
  } while (0)
#define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size)                           \
  do {                                                                         \
    TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),             \
          (size_t)(size), __LINE__);                                           \
    ASAN_UNPOISON_MEMORY_REGION(addr, size);                                   \
  } while (0)
/* Computes |value| without a branch: the arithmetic right shift smears the
 * sign bit into an all-zeros/all-ones mask, then add+xor conditionally
 * negates.  NOTE(review): the assert bounds the argument by INT_MIN even
 * though the parameter is intptr_t — presumably callers only pass
 * int-range values; confirm before feeding wider inputs. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
branchless_abs(intptr_t value) {
  assert(value > INT_MIN);
  const size_t expanded_sign =
      (size_t)(value >> (sizeof(value) * CHAR_BIT - 1));
  return ((size_t)value + expanded_sign) ^ expanded_sign;
}
/* True when x is a power of two; note x == 0 also satisfies the test. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline bool
is_powerof2(size_t x) {
  return (x & (x - 1)) == 0;
}
/* Rounds value down to a multiple of granularity (must be a power of 2). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
floor_powerof2(size_t value, size_t granularity) {
  assert(is_powerof2(granularity));
  return value & ~(granularity - 1);
}
/* Rounds value up to a multiple of granularity (must be a power of 2);
 * note value + granularity - 1 may wrap for values near SIZE_MAX. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t
ceil_powerof2(size_t value, size_t granularity) {
  return floor_powerof2(value + granularity - 1, granularity);
}
/* log2 of a power-of-two value (count-trailing-zeros); see utils.c. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned
log2n_powerof2(size_t value_uintptr);
/* Bijective 64-bit mixer (Pelle Evensen); see utils.c. */
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v);
/* Small cache for osal_monotime(): the last reading is reused for a
 * handful of calls to avoid hammering the clock in hot loops. */
struct monotime_cache {
  uint64_t value;         /* last osal_monotime() reading */
  int expire_countdown;   /* calls remaining before a refresh */
};
/* Returns elapsed monotonic time since begin_timestamp, refreshing the
 * cached clock value only once per 14 calls (42 / 3). */
MDBX_MAYBE_UNUSED static inline uint64_t
monotime_since_cached(uint64_t begin_timestamp, struct monotime_cache *cache) {
  if (cache->expire_countdown)
    cache->expire_countdown -= 1;
  else {
    cache->value = osal_monotime();
    cache->expire_countdown = 42 / 3;
  }
  return cache->value - begin_timestamp;
}

314
src/walk.c Normal file
View File

@ -0,0 +1,314 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* State shared across the whole walk: the user callback and its opaque
 * context, walk options, current recursion depth, and the transaction
 * plus cursor the traversal runs on. */
typedef struct walk_ctx {
  void *userctx;          /* opaque pointer passed through to the visitor */
  walk_options_t options; /* e.g. dont_check_keys_ordering */
  int deep;               /* current tree depth, 0 at each sub-db root */
  walk_func *visitor;     /* per-page callback */
  MDBX_txn *txn;
  MDBX_cursor *cursor;    /* cursor for the tree currently being walked */
} walk_ctx_t;
__cold static int walk_sdb(walk_ctx_t *ctx, walk_sdb_t *sdb);
/* Classifies a page by its flags (the P_SPILLED bit is irrelevant here);
 * nullptr or an unrecognized flag combination maps to page_broken. */
static page_type_t walk_page_type(const page_t *mp) {
  if (!mp)
    return page_broken;
  switch (mp->flags & ~P_SPILLED) {
  case P_BRANCH:
    return page_branch;
  case P_LEAF:
    return page_leaf;
  case P_LEAF | P_DUPFIX:
    return page_dupfix_leaf;
  case P_LARGE:
    return page_large;
  default:
    return page_broken;
  }
}
/* Classifies a nested sub-page embedded inside a leaf node; only leaf
 * sub-pages are valid, anything else indicates corruption. */
static page_type_t walk_subpage_type(const page_t *sp) {
  switch (sp->flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) {
  case P_LEAF | P_SUBP:
    return page_sub_leaf;
  case P_LEAF | P_DUPFIX | P_SUBP:
    return page_sub_dupfix_leaf;
  default:
    return page_sub_broken;
  }
}
/* Depth-first tree traversal.
 * Pass 1 over the page's entries accounts space usage (header, payload,
 * unused room, odd-size alignment bytes) and reports nested large pages
 * and sub-pages to ctx->visitor; then the page itself is reported; pass 2
 * recurses into branch children, sub-databases and dupsort sub-trees.
 * A visitor returning MDBX_RESULT_TRUE stops the walk without an error.
 * On corruption, err is set and carried to the visitor so broken pages are
 * reported rather than silently skipped. */
__cold static int walk_pgno(walk_ctx_t *ctx, walk_sdb_t *sdb, const pgno_t pgno,
                            txnid_t parent_txnid) {
  assert(pgno != P_INVALID);
  page_t *mp = nullptr;
  /* keep going even if the page is unreadable: err is propagated below */
  int err = page_get(ctx->cursor, pgno, &mp, parent_txnid);
  const page_type_t type = walk_page_type(mp);
  const size_t nentries = mp ? page_numkeys(mp) : 0;
  size_t header_size =
      (mp && !is_dupfix_leaf(mp)) ? PAGEHDRSZ + mp->lower : PAGEHDRSZ;
  size_t payload_size = 0;
  size_t unused_size =
      (mp ? page_room(mp) : ctx->txn->env->ps - header_size) - payload_size;
  size_t align_bytes = 0;
  /* pass 1: accounting + visiting nested large/sub pages */
  for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) {
    if (type == page_dupfix_leaf) {
      /* DUPFIX pages have no entries[] or node headers */
      payload_size += mp->dupfix_ksize;
      continue;
    }
    const node_t *node = page_node(mp, i);
    header_size += NODESIZE;
    const size_t node_key_size = node_ks(node);
    payload_size += node_key_size;
    if (type == page_branch) {
      /* the leftmost branch key is always empty */
      assert(i > 0 || node_ks(node) == 0);
      align_bytes += node_key_size & 1;
      continue;
    }
    const size_t node_data_size = node_ds(node);
    assert(type == page_leaf);
    switch (node_flags(node)) {
    case 0 /* usual node */:
      payload_size += node_data_size;
      align_bytes += (node_key_size + node_data_size) & 1;
      break;
    case N_BIGDATA /* long data on the large/overflow page */: {
      const pgno_t large_pgno = node_largedata_pgno(node);
      const size_t over_payload = node_data_size;
      const size_t over_header = PAGEHDRSZ;
      assert(err == MDBX_SUCCESS);
      pgr_t lp = page_get_large(ctx->cursor, large_pgno, mp->txnid);
      const size_t npages =
          ((err = lp.err) == MDBX_SUCCESS) ? lp.page->pages : 1;
      const size_t pagesize = pgno2bytes(ctx->txn->env, npages);
      const size_t over_unused = pagesize - over_payload - over_header;
      /* report the overflow chain as its own visit */
      const int rc = ctx->visitor(large_pgno, npages, ctx->userctx, ctx->deep,
                                  sdb, pagesize, page_large, err, 1,
                                  over_payload, over_header, over_unused);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
      /* only the page-number reference lives on this leaf */
      payload_size += sizeof(pgno_t);
      align_bytes += node_key_size & 1;
    } break;
    case N_SUBDATA /* sub-db */: {
      if (unlikely(node_data_size != sizeof(tree_t))) {
        ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid subDb node size", (unsigned)node_data_size);
        /* debug builds expect page checking to have flagged this already */
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      }
      header_size += node_data_size;
      align_bytes += (node_key_size + node_data_size) & 1;
    } break;
    case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
      if (unlikely(node_data_size != sizeof(tree_t))) {
        ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid sub-tree node size", (unsigned)node_data_size);
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      }
      header_size += node_data_size;
      align_bytes += (node_key_size + node_data_size) & 1;
      break;
    case N_DUPDATA /* short sub-page */: {
      if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) {
        ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid sub-page node size", (unsigned)node_data_size);
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
        break;
      }
      const page_t *const sp = node_data(node);
      const page_type_t subtype = walk_subpage_type(sp);
      const size_t nsubkeys = page_numkeys(sp);
      if (unlikely(subtype == page_sub_broken)) {
        ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid sub-page flags", sp->flags);
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      }
      /* account the embedded sub-page the same way as a real page */
      size_t subheader_size =
          is_dupfix_leaf(sp) ? PAGEHDRSZ : PAGEHDRSZ + sp->lower;
      size_t subunused_size = page_room(sp);
      size_t subpayload_size = 0;
      size_t subalign_bytes = 0;
      for (size_t ii = 0; err == MDBX_SUCCESS && ii < nsubkeys; ++ii) {
        if (subtype == page_sub_dupfix_leaf) {
          /* DUPFIX pages have no entries[] or node headers */
          subpayload_size += sp->dupfix_ksize;
        } else {
          assert(subtype == page_sub_leaf);
          const node_t *subnode = page_node(sp, ii);
          const size_t subnode_size = node_ks(subnode) + node_ds(subnode);
          subheader_size += NODESIZE;
          subpayload_size += subnode_size;
          subalign_bytes += subnode_size & 1;
          if (unlikely(node_flags(subnode) != 0)) {
            ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED,
                  "unexpected sub-node flags", node_flags(subnode));
            assert(err == MDBX_CORRUPTED);
            err = MDBX_CORRUPTED;
          }
        }
      }
      /* report the sub-page as a visit one level deeper, 0 extra pages */
      const int rc =
          ctx->visitor(pgno, 0, ctx->userctx, ctx->deep + 1, sdb,
                       node_data_size, subtype, err, nsubkeys, subpayload_size,
                       subheader_size, subunused_size + subalign_bytes);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
      header_size += subheader_size;
      unused_size += subunused_size;
      payload_size += subpayload_size;
      align_bytes += subalign_bytes + (node_key_size & 1);
    } break;
    default:
      ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED,
            "invalid node flags", node_flags(node));
      assert(err == MDBX_CORRUPTED);
      err = MDBX_CORRUPTED;
    }
  }
  /* report this page itself */
  const int rc = ctx->visitor(
      pgno, 1, ctx->userctx, ctx->deep, sdb, ctx->txn->env->ps, type, err,
      nentries, payload_size, header_size, unused_size + align_bytes);
  if (unlikely(rc != MDBX_SUCCESS))
    return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
  /* pass 2: recurse into children, sub-dbs and dupsort sub-trees */
  for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) {
    if (type == page_dupfix_leaf)
      continue;
    node_t *node = page_node(mp, i);
    if (type == page_branch) {
      assert(err == MDBX_SUCCESS);
      ctx->deep += 1;
      err = walk_pgno(ctx, sdb, node_pgno(node), mp->txnid);
      ctx->deep -= 1;
      if (unlikely(err != MDBX_SUCCESS)) {
        if (err == MDBX_RESULT_TRUE)
          break;
        return err;
      }
      continue;
    }
    assert(type == page_leaf);
    switch (node_flags(node)) {
    default:
      continue;
    case N_SUBDATA /* sub-db */:
      if (unlikely(node_ds(node) != sizeof(tree_t))) {
        ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid sub-tree node size", (unsigned)node_ds(node));
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      } else {
        /* copy to an aligned buffer: node data may be misaligned */
        tree_t aligned_db;
        memcpy(&aligned_db, node_data(node), sizeof(aligned_db));
        walk_sdb_t subdb = {{node_key(node), node_ks(node)}, nullptr, nullptr};
        subdb.internal = &aligned_db;
        assert(err == MDBX_SUCCESS);
        ctx->deep += 1;
        err = walk_sdb(ctx, &subdb);
        ctx->deep -= 1;
      }
      break;
    case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */:
      if (unlikely(node_ds(node) != sizeof(tree_t))) {
        ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED,
              "invalid dupsort sub-tree node size", (unsigned)node_ds(node));
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      } else {
        tree_t aligned_db;
        memcpy(&aligned_db, node_data(node), sizeof(aligned_db));
        assert(err == MDBX_SUCCESS);
        err = cursor_dupsort_setup(ctx->cursor, node, mp);
        if (likely(err == MDBX_SUCCESS)) {
          assert(ctx->cursor->subcur ==
                 &container_of(ctx->cursor, cursor_couple_t, outer)->inner);
          /* descend into the inner (dupsort) cursor for the sub-tree */
          ctx->cursor = &ctx->cursor->subcur->cursor;
          ctx->deep += 1;
          sdb->nested = &aligned_db;
          err = walk_pgno(ctx, sdb, aligned_db.root, mp->txnid);
          sdb->nested = nullptr;
          ctx->deep -= 1;
          /* restore the outer cursor */
          subcur_t *inner_xcursor = container_of(ctx->cursor, subcur_t, cursor);
          cursor_couple_t *couple =
              container_of(inner_xcursor, cursor_couple_t, inner);
          ctx->cursor = &couple->outer;
        }
      }
      break;
    }
  }
  return MDBX_SUCCESS;
}
/* Walks one (sub-)database: sets up a throw-away cursor couple over the
 * tree, applies the key-ordering-check option, and starts the recursive
 * walk from the tree root.  Empty trees succeed immediately. */
__cold static int walk_sdb(walk_ctx_t *ctx, walk_sdb_t *sdb) {
  tree_t *const db = sdb->internal;
  if (unlikely(db->root == P_INVALID))
    return MDBX_SUCCESS; /* empty db */
  kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}};
  cursor_couple_t couple;
  int rc = cursor_init4walk(&couple, ctx->txn, db, &kvx);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* z_ignord disables key-ordering checks when the caller asked so */
  const uint8_t cursor_checking = (ctx->options & dont_check_keys_ordering)
                                      ? z_pagecheck | z_ignord
                                      : z_pagecheck;
  couple.outer.checking |= cursor_checking;
  couple.inner.cursor.checking |= cursor_checking;
  /* chain the new cursor in front of the current one for the recursion */
  couple.outer.next = ctx->cursor;
  couple.outer.top_and_flags = z_disable_tree_search_fastpath;
  ctx->cursor = &couple.outer;
  rc = walk_pgno(ctx, sdb, db->root,
                 db->mod_txnid ? db->mod_txnid : ctx->txn->txnid);
  ctx->cursor = couple.outer.next;
  return rc;
}
/* Public entry point: validates the transaction, then walks the GC tree
 * (FREE_DBI) followed by MAIN_DBI with all its sub-databases, invoking
 * `visitor` for every page.  Non-error results from the first walk
 * (e.g. an early-stop MDBX_RESULT_TRUE) do not suppress the second. */
__cold int walk_pages(MDBX_txn *txn, walk_func *visitor, void *user,
                      walk_options_t options) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  walk_ctx_t ctx = {
      .txn = txn, .userctx = user, .visitor = visitor, .options = options};
  walk_sdb_t sdb = {.name = {.iov_base = MDBX_CHK_GC},
                    .internal = &txn->dbs[FREE_DBI]};
  rc = walk_sdb(&ctx, &sdb);
  if (!MDBX_IS_ERROR(rc)) {
    sdb.name.iov_base = MDBX_CHK_MAIN;
    sdb.internal = &txn->dbs[MAIN_DBI];
    rc = walk_sdb(&ctx, &sdb);
  }
  return rc;
}

Some files were not shown because too many files have changed in this diff Show More