Compare commits


No commits in common. "master" and "v0.8.0" have entirely different histories.

215 changed files with 35461 additions and 74403 deletions

.circleci/config.yml Normal file

@@ -0,0 +1,20 @@
version: 2
jobs:
build:
docker:
- image: circleci/buildpack-deps:20.04
environment:
- TESTDB: /tmp/test.db
- TESTLOG: /tmp/test.log
steps:
- checkout
- run: make all
- run: ulimit -c unlimited && make check
- run:
command: |
mkdir -p /tmp/artifacts
mv -t /tmp/artifacts $TESTLOG $TESTDB core.*
when: on_fail
- store_artifacts:
path: /tmp/artifacts
destination: test-artifacts

.cirrus.yml Normal file

@@ -0,0 +1,6 @@
freebsd_instance:
image_family: freebsd-12-1-snap
task:
install_script: pkg install -y gmake bash git
script: git fetch --tags && gmake check


@@ -1,3 +1,3 @@
 BasedOnStyle: LLVM
-Standard: c++20
-ColumnLimit: 120
+Standard: Cpp11
+ReflowComments: true


@@ -1,3 +0,0 @@
format:
line_width: 120
tab_size: 2

.github/workflows/release-assets.yml vendored Normal file

@@ -0,0 +1,54 @@
# Based on the https://github.com/actions/upload-release-asset example
on:
push:
# Sequence of patterns matched against refs/tags
tags:
- 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10
name: Upload Release Asset
jobs:
build:
name: Upload Release Asset
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Build assets
run: |
make release-assets
- id: name
run: |
echo "::set-output name=tarball::$(ls *.tar.gz)"
echo "::set-output name=zip::$(ls *.zip)"
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ github.ref }}
release_name: Release ${{ github.ref }}
draft: true
prerelease: true
- name: Upload tarball
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ${{ steps.name.outputs.tarball }}
asset_name: amalgamated.tar.gz
# asset_label: Amalgamated source tarball
asset_content_type: application/tar+gzip
- name: Upload zip
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ${{ steps.name.outputs.zip }}
asset_name: amalgamated.zip
# asset_label: Amalgamated source zip-archive
asset_content_type: application/zip

.gitignore vendored

@@ -1,69 +1,35 @@
*[~#]
@*
*.[ao] *.[ao]
*.autosave
*.bak *.bak
build.ninja
cmake-build-*
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
CMakeLists.txt.user
core
CTestTestfile.cmake
DartConfiguration.tcl
dist/
*.dll
docs/Doxyfile
docs/html/
*.dSYM
*.dylib
*.err
*.exe *.exe
*.gcda *.gcda
*.gcno *.gcno
*.gcov *.gcov
.idea
libmdbx.creator.user
*.lo *.lo
mdbx_chk
mdbx_copy
mdbx_drop
mdbx_dump
mdbx_example
mdbx_load
mdbx_stat
mdbx_test
.ninja_deps
.ninja_log
*.orig *.orig
*.rej *.rej
*.so *.so
src/config.h *[~#]
src/version.c .idea
*.tar* .le.ini
test/cmake_install.cmake .vs/
test/CTestTestfile.cmake cmake-build-*
test_extra_crunched_delete @*
test_extra_cursor_closing core
test_extra_dbi mdbx_example
test_extra_doubtless_positioning libmdbx.creator.user
test_extra_dupfix_addodd mdbx_chk
test_extra_dupfix_multiple mdbx_copy
test_extra_early_close_dbi mdbx_dump
test_extra_hex_base64_base58 mdbx_load
test_extra_maindb_ordinal mdbx_stat
test_extra_open mdbx_test
test_extra_pcrf
test_extra_upsert_alldups
Testing/
test.log test.log
test/tmp.db test/tmp.db
test/tmp.db-lck test/tmp.db-lck
tmp.db tmp.db
tmp.db-lck tmp.db-lck
valgrind.* valgrind.*
version.c src/elements/version.c
.vs/ src/elements/config.h
.vscode/ dist/
*.zip *.tar*

.le.ini

@@ -1,40 +0,0 @@
tabsize=8
indentsize=8
autoindent=1
bsunindents=1
insert=1
inputmode=0
editmode=1
makebak=0
bakpath=
make=exec make
shell=exec $SHELL
run=exec make run
compile=exec make "$FNAME.o"
scroll=1
hscroll=32
rblock=0
savepos=1
savehst=1
noreg=1
match_case=1
linelen=72
leftmrg=0
flnmarg=0
leftadj=1
rightadj=0
helpcmd=exec /usr/share/le/help
usecolor=1
usetabs=1
scrollbar=0
statusline=0
backupext=.~%d~
backupnum=9
preferpagetop=1
wordwrap=0
syntaxhl=1
undo_enable=1
undo_min_levels=4
undo_max_memory=128
undo_glue=1
usemouse=0

.travis.yml Normal file

@@ -0,0 +1,88 @@
language: c cpp
sudo: false
env:
global:
- secure: "M+W+heGGyRQJoBq2W0uqWVrpL4KBXmL0MFL7FSs7f9vmAaDyEgziUXeZRj3GOKzW4kTef3LpIeiu9SmvqSMoQivGGiomZShqPVl045o/OUgRCAT7Al1RLzEZ0efSHpIPf0PZ6byEf6GR2ML76OfuL6JxTVdnz8iVyO2sgLE1HbX1VeB+wgd/jfMeOBhCCXskfK6MLyZihfMYsiYZYSaV98ZDhDLSlzuuRIgzb0bMi8aL6AErs0WLW0NelRBeHkKPYfAUc85pdQHscgrJw6Rh/zT6+8BQ/q5f4IgWhiu4xoRg3Ngl7SNoedRQh93ADM3UG2iGl6HDFpVORaXcFWKAtuYY+kHQ0HB84BRYpQmeBuXNpltsfxQ3d1Q3u0RlE45zRvmr2+X1mFnkcNUAWISLPbsOUlriDQM8irGwRpho77/uYnRC00bJsHW//s6+uPf9zrAw1nI4f0y3PAWukGF/xs6HAI3FZPsuSSnx18Tj3Opgbc9Spop+V3hkhdiJoPGpNKTkFX4ZRXfkPgoRVJmtp4PpbpH0Ps/mCriKjMEfGGi0HcVCi0pEGLXiecdqJ5KPg5+22zNycEujQBJcNTKd9shN+R3glrbmhAxTEzGdGwxXXJ2ybwJ2PWJLMYZ7g98nLyX+uQPaA3BlsbYJHNeS5283/9pJsd9DzfHKsN2nFSc="
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- cmake
- clang-format
update: true
matrix:
include:
- os: linux
dist: focal
compiler: gcc
env: CC=cc CXX=c++
- os: linux
dist: focal
compiler: clang
env: CC=clang CXX=clang++
- os: linux
dist: bionic
compiler: gcc
env: CC=cc CXX=c++
- os: linux
dist: bionic
compiler: clang
env: CC=clang CXX=clang++
- os: linux
dist: xenial
compiler: gcc
env: CC=cc CXX=c++
- os: linux
dist: xenial
compiler: clang
env: CC=clang CXX=clang++
- os: osx
osx_image: xcode11.3
env: CC=cc CXX=c++
- os: osx
osx_image: xcode9.4
env: CC=cc CXX=c++
before_script: |
if [ "${TRAVIS_BRANCH}" = "coverity_scan" ]; then
# call Coverity Scan manually instead of addons.coverity_scan for the first job only
if [ "${TRAVIS_JOB_NUMBER}" = "${TRAVIS_BUILD_NUMBER}.1" ]; then
export COVERITY_SCAN_BRANCH=1
echo -n | openssl s_client -connect scan.coverity.com:443 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | sudo tee -a /etc/ssl/certs/ca-
curl -s 'https://scan.coverity.com/scripts/travisci_build_coverity_scan.sh' -o coverity_scan.sh
else
echo 'echo "Skip CoverityScan for unrelated os/compiler"' > coverity_scan.sh
fi
fi
script: |
${CC} --version
${CXX} --version
git fetch --unshallow --tags --prune || exit 1
if [ ! -s ./coverity_scan.sh ]; then
make --keep-going all && MALLOC_CHECK_=7 MALLOC_PERTURB_=42 make --keep-going check
else
COVERITY_SCAN_PROJECT_NAME="ReOpen/libmdbx" \
COVERITY_SCAN_NOTIFICATION_EMAIL="leo@yuriev.ru" \
COVERITY_SCAN_BUILD_COMMAND_PREPEND="" \
COVERITY_SCAN_BUILD_COMMAND="make MDBX_OPTIONS=-DMDBX_DEBUG=2 build-test" \
COVERITY_SCAN_BRANCH_PATTERN="$TRAVIS_BRANCH" \
bash ./coverity_scan.sh || cat cov-int/scm_log.txt
fi
after_script: |
if [ "${TRAVIS_BRANCH}" != "coverity_scan" -a "${TRAVIS_JOB_NUMBER}" = "${TRAVIS_BUILD_NUMBER}.1" ] && make reformat && [[ -n $(git diff) ]]; then
echo "You must run 'make reformat' before submitting a pull request"
echo "-------------------------------------------------------------------------------"
git diff
sleep 1
echo "-------------------------------------------------------------------------------"
sleep 1
exit -1
fi
echo "-------------------------------------------------------------------------------"
sleep 1

AUTHORS Normal file

@@ -0,0 +1,32 @@
Contributors
============
Alexey Naumov <alexey.naumov@gmail.com>
Chris Mikkelson <cmikk@qwest.net>
Claude Brisson <claude.brisson@gmail.com>
David Barbour <dmbarbour@gmail.com>
David Wilson <dw@botanicus.net>
dreamsxin <dreamsxin@126.com>
Hallvard Furuseth <hallvard@openldap.org>, <h.b.furuseth@usit.uio.no>
Heiko Becker <heirecka@exherbo.org>
Howard Chu <hyc@openldap.org>, <hyc@symas.com>
Ignacio Casal Quinteiro <ignacio.casal@nice-software.com>
James Rouzier <rouzier@gmail.com>
Jean-Christophe DUBOIS <jcd@tribudubois.net>
John Hewson <john@jahewson.com>
Klaus Malorny <klaus.malorny@knipp.de>
Kurt Zeilenga <kurt.zeilenga@isode.com>
Leonid Yuriev <leo@yuriev.ru>, <lyuryev@ptsecurity.com>
Lorenz Bauer <lmb@cloudflare.com>
Luke Yeager <lyeager@nvidia.com>
Martin Hedenfalk <martin@bzero.se>
Ondrej Kuznik <ondrej.kuznik@acision.com>
Orivej Desh <orivej@gmx.fr>
Oskari Timperi <oskari.timperi@iki.fi>
Pavel Medvedev <pmedvedev@gmail.com>
Philipp Storz <philipp.storz@bareos.com>
Quanah Gibson-Mount <quanah@openldap.org>
Salvador Ortiz <sog@msg.com.mx>
Sebastien Launay <sebastien@slaunay.fr>
Vladimir Romanov <vromanov@gmail.com>
Zano Foundation <crypto.sowle@gmail.com>

File diff suppressed because it is too large

COPYRIGHT

@@ -1,139 +1,7 @@
Copyright (c) 2015-2025 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
Copyright 2011-2015 Howard Chu, Symas Corp.
Licensed under the Apache License, Version 2.0 (the "License"); Copyright 2015,2016 Peter-Service R&D LLC.
you may not use this file except in compliance with the License. All rights reserved.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-----------------------------------------------------------------------
СМЕНА ЛИЦЕНЗИИ (THE LICENSE CHANGE)
OpenLDAP Public License → Apache 2.0
Briefly:
Historically, in 2015 an early MDBX source code was derived from the
"LMDB engine" created by Howard Chu <hyc@symas.com> in 2011-2015,
which based on btree.c written by Martin Hedenfalk <martin@bzero.se>.
By 2024, MDBX source code has actually been rewritten and has so
little in common with the original LMDB that I thought it admissible
to change the license. Below are more detailed explanations.
Кратко:
Исторически в 2015 году ранний исходный код MDBX был заимствован из
«LMDB engine», созданной Howard Chu <hyc@symas.com> в 2011-2015,
на основе btree.c, ранее созданного Martin Hedenfalk <martin@bzero.se>.
К 2024 году исходный код MDBX фактически переписан и имеет настолько
мало общего с первоначальным заимствованием из LMDB, что я счел
уместным сменить лицензию. Ниже более подробные пояснения.
---
Первоисточник текста формулирован на Русском языке, который является
родным для автора. Предполагается что все заинтересованные могут легко
воспользоваться машинным переводом, который при всех недостатках сможет
донести суть, намерения и местами даже передать тональность.
The original source of this text is in Russian, which is the author's
native language. It is assumed that all concerned can easily use machine
translation, which, with all the disadvantages, will be able to convey
the essence, intentions and, in some places, even convey the tonality of
a wording.
1. Причины
1.1. Лицензия Apache-2.0 является одной из самых популярных, так как
содержит ряд уточнений, проясняющих и упрощающих использование исходного
кода в производных работах и больших проектах. Эти особенности лицензии
Apache-2.0 я нахожу достаточно ценными и удобными. Соответственно,
переход на лицензию Apache-2.0 полезным в целом.
1.2. Проект OpenLDAP имеет определенную известность, в том числе, к
сожалению, среди специалистов славится кране плохим качеством кода и
сбоями при отходе от простых/базовых сценариев использования. Поэтому
использование лицензии OpenLDAP, в глазах части аудитории, бросает тень
на качества кода libmdbx, несмотря на то, что исходный код библиотеки
переписан, в том числе, с целью повышения качества, надежности,
стабильности и пригодности к тестированию.
Отмечу, что здесь не место для обсуждения объективности подобных мнений
и причин, равно как и не место для оценки компетентности специалистов
высказывающих такие суждения. Однако, здесь необходимо озвучить сам факт
наличия такой негативной коннотации качества кода при упоминании
OpenLDAP, совершенно без намерения как-либо задеть или обидеть
контрибьюторов OpenLDAP.
1.3. С точки зрения исходного кода, к настоящему времени libmdbx стала
совсем другим продуктом, о котором сейчас правильнее сказать что
разработка вдохновлена LMDB, нежели основывается на заимствовании кода.
Смена лицензии на переписанный код подчеркивает, что это действительно
новый исходный код.
2. Легитимность
2.1. Исходная лицензия OpenLDAP 2.8 и актуальная лицензия Apache 2.0
совпадают по базовым условиям. При этом лицензия Apache 2.0 уточняет,
определяет и проясняет многие аспекты. Поэтому смену лицензии я склонен
трактовать как уточнение, но НЕ как принципиальное изменение, которое
могло-бы нарушить чьи-либо права.
2.2. С процедурной точки зрения, у меня есть право сменить лицензию на
новый, написанный мной, исходный код. При этом объективно существует как
техническая, так и юридическая проблемы отделения «нового кода» от
«заимствованного», а также выделение/классификация кода, который
является общественным достоянием и/или общеупотребительным воплощением
«математических моделей и других публичных знаний».
Основываясь на собственной субъективной оценке кодовой базы, включая
соотношения «нового», «заимствованного» и «общеупотребительного»
исходного кода, я считаю что смена лицензии допустима. Одновременно с
этим, я понимаю и признаю, что можно найти повод, чтобы трактовать
ситуацию как «стакан наполовину полон/пуст». Поэтому декларирую
готовность принимать претензии и устранять их путем полного
переписывания оставшегося исходного кода, который попадает под критерии
«заимствованного» и кто-то из контрибьюторов которого будет против
изменения лицензии.
2.3. Вне зависимости от истории происхождения каждой строки исходного
кода и её буквального авторства, прошу не считать производимую смену
лицензии, и связанных с этим технических действий, как попытку плагиата,
присвоения чужого труда, присвоения авторства или принижения вклада
других авторов/контрибьторов. Безусловно проект MDBX/libmdbx не появился
бы без LMDB и всех участников проекта LMDB, в особенности Говарда Чу
(Howard Chu), Холлварда Фурусет (Hallvard Furuseth) и Мартина Хеденфок
(Martin Hedenfalk). Как-бы исходный код не переписывался он всё равно
будет основываться на базовых идеях и включать основные концепции LMDB.
3. Последствия и актуальные требования
Всё очень просто. Потребуется обеспечить требования новой лицензии в
соответствии с 4-м пунктом лицензции Apache 2.0.
В частности, при использовании/распространении libmdbx потребуется
обеспечить наличие файлов с текстом лицензии и файла NOTICE, а также
обеспечить пользователям возможность ознакомиться с их содержимым в
работах/продуктах использующих libmdbx.
-----------------------------------------------------------------------
Далее в справочных целях приведены уведомления об авторских правах из
первоначально заимствованного кода.
---
Original source code was derived from LMDB in 2015,
and later evolutionarily rewritten in 2015-2024:
Copyright (c) 2011-2015 Howard Chu, Symas Corp. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted only as authorized by the OpenLDAP modification, are permitted only as authorized by the OpenLDAP
@@ -143,17 +11,12 @@ A copy of this license is available in the file LICENSE in the
top-level directory of the distribution or, alternatively, at top-level directory of the distribution or, alternatively, at
<http://www.OpenLDAP.org/license.html>. <http://www.OpenLDAP.org/license.html>.
LMDB itself devived code from btree.c written by Martin Hedenfalk: OpenLDAP is a registered trademark of the OpenLDAP Foundation.
Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
Permission to use, copy, modify, and distribute this software for any Individual files and/or contributed packages may be copyright by
purpose with or without fee is hereby granted, provided that the above other parties and/or subject to additional restrictions.
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES This work also contains materials derived from public sources.
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR Additional information about OpenLDAP can be obtained at
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES <http://www.openldap.org/>.
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

LICENSE

@@ -1,177 +1,47 @@
The OpenLDAP Public License
Version 2.8, 17 August 2003
Apache License Redistribution and use of this software and associated documentation
Version 2.0, January 2004 ("Software"), with or without modification, are permitted provided
http://www.apache.org/licenses/ that the following conditions are met:
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Redistributions in source form must retain copyright statements
and notices,
1. Definitions. 2. Redistributions in binary form must reproduce applicable copyright
statements and notices, this list of conditions, and the following
disclaimer in the documentation and/or other materials provided
with the distribution, and
"License" shall mean the terms and conditions for use, reproduction, 3. Redistributions must contain a verbatim copy of this document.
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by The OpenLDAP Foundation may revise this license from time to time.
the copyright owner that is granting the License. Each revision is distinguished by a version number. You may use
this Software under terms of this license revision or under the
terms of any subsequent revision of the license.
"Legal Entity" shall mean the union of the acting entity and all THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS
other entities that control, are controlled by, or are under common CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
control with that entity. For the purposes of this definition, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
"control" means (i) the power, direct or indirect, to cause the AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
direction or management of such entity, whether by contract or SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S)
otherwise, or (ii) ownership of fifty percent (50%) or more of the OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
outstanding shares, or (iii) beneficial ownership of such entity. INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"You" (or "Your") shall mean an individual or Legal Entity The names of the authors and copyright holders must not be used in
exercising permissions granted by this License. advertising or otherwise to promote the sale, use or other dealing
in this Software without specific, written prior permission. Title
to copyright in this Software shall at all times remain with copyright
holders.
"Source" form shall mean the preferred form for making modifications, OpenLDAP is a registered trademark of the OpenLDAP Foundation.
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical Copyright 1999-2003 The OpenLDAP Foundation, Redwood City,
transformation or translation of a Source form, including but California, USA. All Rights Reserved. Permission to copy and
not limited to compiled object code, generated documentation, distribute verbatim copies of this document is granted.
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS


@@ -1,16 +1,5 @@
-# This is thunk-Makefile for calling GNU Make 3.80 or above
-
-all help options cmake-build ninja \
-clean install install-no-strip install-strip strip tools uninstall \
-bench bench-clean bench-couple bench-quartet bench-triplet re-bench \
-lib libs lib-static lib-shared tools-static \
-libmdbx mdbx mdbx_chk mdbx_copy mdbx_drop mdbx_dump mdbx_load mdbx_stat \
-check dist memcheck cross-gcc cross-qemu doxygen gcc-analyzer reformat \
-release-assets tags build-test mdbx_test \
-smoke smoke-fault smoke-singleprocess smoke-assertion smoke-memcheck \
-test test-assertion test-long test-long-assertion test-ci test-ci-extra \
-test-asan test-leak test-singleprocess test-ubsan test-memcheck:
+all bench bench-quartet build-test check clean clean-bench cross-gcc cross-qemu dist gcc-analyzer install mdbx memcheck reformat release-assets strip test test-asan test-fault test-leak test-singleprocess test-ubsan test-valgrind tools:
 @CC=$(CC) \
 CXX=`if test -n "$(CXX)" && which "$(CXX)" > /dev/null; then echo "$(CXX)"; elif test -n "$(CCC)" && which "$(CCC)" > /dev/null; then echo "$(CCC)"; else echo "c++"; fi` \
-`which gmake || which gnumake || echo 'echo "GNU Make 3.80 or above is required"; exit 2;'` \
+`which gmake || which gnumake || echo 'echo "GNU Make is required"; exit 2;'` \
 $(MAKEFLAGS) -f GNUmakefile $@

NOTICE

@@ -1,39 +0,0 @@
libmdbx (aka MDBX) is an extremely fast, compact, powerful, embeddedable,
transactional key-value storage engine with open-source code. MDBX has a
specific set of properties and capabilities, focused on creating unique
lightweight solutions.
Please visit https://libmdbx.dqdkfa.ru for more information, changelog,
documentation, C++ API description and links to the original git repo
with the source code. Questions, feedback and suggestions are welcome
to the Telegram' group https://t.me/libmdbx.
Donations are welcome to the Ethereum/ERC-20 `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
Всё будет хорошо!
Copyright 2015-2025 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru>
SPDX-License-Identifier: Apache-2.0
For notes about the license change, credits and acknowledgments,
please refer to the COPYRIGHT file within libmdbx source.
---
On 2022-04-15, without any warnings or following explanations, the
Github administration deleted _libmdbx_, my account and all other
projects (status 404). A few months later, without any involvement or
notification from/to me, the projects were restored/opened in the "public
read-only archive" status from some kind of incomplete backup. I regard
these actions of Github as malicious sabotage, and I consider the Github
service itself to have lost trust forever.
As a result of what has happened, I will never, under any circumstances,
post the primary sources (aka origins) of my projects on Github, or rely
in any way on the Github infrastructure.
Nevertheless, realizing that it is more convenient for users of
_libmdbx_ and other my projects to access ones on Github, I do not want
to restrict their freedom or create inconvenience, and therefore I place
mirrors (aka mirrors) of such repositories on Github since 2025. At the
same time, I would like to emphasize once again that these are only
mirrors that can be frozen, blocked or deleted at any time, as was the
case in 2022.

README.md

@@ -1,20 +1,18 @@
<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences --> <!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences -->
libmdbx libmdbx
======== =======
<!-- section-begin overview --> _libmdbx_ is an extremely fast, compact, powerful, embedded,
transactional [key-value store](https://en.wikipedia.org/wiki/Key-value_database)
database, with [permissive license](LICENSE).
_MDBX_ has a specific set of properties and capabilities,
focused on creating unique lightweight solutions with extraordinary performance.
_libmdbx_ is an extremely fast, compact, powerful, embedded, transactional 1. Allows **swarm of multi-threaded processes to
[key-value database](https://en.wikipedia.org/wiki/Key-value_database), [ACID]((https://en.wikipedia.org/wiki/ACID))ly read and update** several
with [Apache 2.0 license](https://gitflic.ru/project/erthink/libmdbx/blob?file=LICENSE).
_libmdbx_ has a specific set of properties and capabilities,
focused on creating unique lightweight solutions.
1. Allows **a swarm of multi-threaded processes to
[ACID](https://en.wikipedia.org/wiki/ACID)ly read and update** several
key-value [maps](https://en.wikipedia.org/wiki/Associative_array) and key-value [maps](https://en.wikipedia.org/wiki/Associative_array) and
[multimaps](https://en.wikipedia.org/wiki/Multimap) in a locally-shared [multimaps](https://en.wikipedia.org/wiki/Multimap) in a localy-shared
database. database.
2. Provides **extraordinary performance**, minimal overhead through 2. Provides **extraordinary performance**, minimal overhead through
@@ -22,165 +20,66 @@ database.
`Olog(N)` operations costs by virtue of [B+ `Olog(N)` operations costs by virtue of [B+
tree](https://en.wikipedia.org/wiki/B%2B_tree). tree](https://en.wikipedia.org/wiki/B%2B_tree).
3. Requires **no maintenance and no crash recovery** since it doesn't use 3. Requires **no maintenance and no crash recovery** since doesn't use
[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging), but that might [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging), but that might
be a caveat for write-intensive workloads with durability requirements. be a caveat for write-intensive workloads with durability requirements.
4. Enforces [serializability](https://en.wikipedia.org/wiki/Serializability) for 4. **Compact and friendly for fully embeddeding**. Only 25KLOC of `C11`,
64K x86 binary code, no internal threads neither processes, but
implements a simplified variant of the [Berkeley
DB](https://en.wikipedia.org/wiki/Berkeley_DB) and
[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API.
5. Enforces [serializability](https://en.wikipedia.org/wiki/Serializability) for
writers just by single writers just by single
[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords
[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom) [wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom)
for parallel readers without atomic/interlocked operations, while for parallel readers without atomic/interlocked operations, while
**writing and reading transactions do not block each other**. **writing and reading transactions do not block each other**.
5. **Guarantee data integrity** after crash unless this was explicitly 6. **Guarantee data integrity** after crash unless this was explicitly
neglected in favour of write performance. neglected in favour of write performance.
6. Supports Linux, Windows, MacOS, Android, iOS, FreeBSD, DragonFly, Solaris, 7. Supports Linux, Windows, MacOS, Android, iOS, FreeBSD, DragonFly, Solaris,
OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with
**POSIX.1-2008**. **POSIX.1-2008**.
7. **Compact and friendly for fully embedding**. Only ≈25KLOC of `C11`, Historically, _MDBX_ is deeply revised and extended descendant of amazing
≈64K x86 binary code of core, no internal threads neither server process(es),
but implements a simplified variant of the [Berkeley
DB](https://en.wikipedia.org/wiki/Berkeley_DB) and
[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API.
<!-- section-end -->
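To make the transactional model described in the overview above concrete, here is a minimal, hedged C sketch of the typical open/put/get cycle against the conventional `mdbx.h` API; flag spellings (e.g. `MDBX_NOSUBDIR`, `MDBX_TXN_RDONLY`) and error handling are simplified and should be checked against the header shipped with the version in use.

```c
#include "mdbx.h"
#include <stdio.h>

int main(void) {
  MDBX_env *env = NULL;
  MDBX_txn *txn = NULL;
  MDBX_dbi dbi;

  /* Create the environment handle and open a single-file database. */
  mdbx_env_create(&env);
  mdbx_env_open(env, "./example.db", MDBX_NOSUBDIR, 0644);

  /* One write transaction: insert a key-value pair into the main table. */
  mdbx_txn_begin(env, NULL, 0, &txn);
  mdbx_dbi_open(txn, NULL, 0, &dbi);
  MDBX_val key = {.iov_base = "hello", .iov_len = 5};
  MDBX_val value = {.iov_base = "world", .iov_len = 5};
  mdbx_put(txn, dbi, &key, &value, 0);
  mdbx_txn_commit(txn);

  /* One read-only transaction: readers and the writer never block each other. */
  mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn);
  MDBX_val out;
  if (mdbx_get(txn, dbi, &key, &out) == MDBX_SUCCESS)
    printf("%.*s\n", (int)out.iov_len, (const char *)out.iov_base);
  mdbx_txn_abort(txn);

  mdbx_env_close(env);
  return 0;
}
```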
Historically, _libmdbx_ is a deeply revised and extended descendant of the legendary
[Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
_libmdbx_ inherits all benefits from _LMDB_, but resolves some issues and adds [a set of improvements](#improvements-beyond-lmdb). _MDBX_ inherits all benefits from _LMDB_, but resolves some issues and adds set of improvements.
[![Telergam: Support | Discussions | News](https://img.shields.io/endpoint?color=scarlet&logo=telegram&label=Support%20%7C%20Discussions%20%7C%20News&url=https%3A%2F%2Ftg.sumanjay.workers.dev%2Flibmdbx)](https://t.me/libmdbx) The next version is under active non-public development from scratch and will be
released as **_MithrilDB_** and `libmithrildb` for libraries & packages.
> Please refer to the online [documentation](https://libmdbx.dqdkfa.ru)
> with [`C` API description](https://libmdbx.dqdkfa.ru/group__c__api.html)
> and pay attention to the [`C++` API](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h%2B%2B#line-num-1).
> Donations are welcome to the Ethereum/ERC-20 `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
> Всё будет хорошо!
Telegram Group archive: [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
[2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
[5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html).
## Github
### на Русском (мой родной язык)
Весной 2022, без каких-либо предупреждений или пояснений, администрация
Github удалила мой аккаунт и все проекты. Через несколько месяцев, без
какого-либо моего участия или уведомления, проекты были
восстановлены/открыты в статусе "public read-only archive" из какой-то
неполноценной резервной копии. Эти действия Github я расцениваю как
злонамеренный саботаж, а сам сервис Github считаю навсегда утратившим
какое-либо доверие.
Вследствие произошедшего, никогда и ни при каких условиях, я не буду
размещать на Github первоисточники (aka origins) моих проектов, либо
как-либо полагаться на инфраструктуру Github.
Тем не менее, понимая что пользователям моих проектов удобнее получать к
ним доступ именно на Github, я не хочу ограничивать их свободу или
создавать неудобство, и поэтому размещаю на Github зеркала (aka mirrors)
репозиториев моих проектов. При этом ещё раз акцентирую внимание, что
это только зеркала, которые могут быть заморожены, заблокированы или
удалены в любой момент, как это уже было в 2022.
### in English
In the spring of 2022, without any warnings or explanations, the Github
administration deleted my account and all projects. A few months later,
without any involvement or notification from me, the projects were
restored/opened in the "public read-only archive" status from some kind
of incomplete backup. I regard these actions of Github as malicious
sabotage, and I consider the Github service itself to have lost any
trust forever.
As a result of what has happened, I will never, under any circumstances,
post the primary sources (aka origins) of my projects on Github, or rely
in any way on the Github infrastructure.
Nevertheless, realizing that it is more convenient for users of my
projects to access them on Github, I do not want to restrict their
freedom or create inconvenience, and therefore I place mirrors of my
project repositories on Github. At the same time, I would like to
emphasize once again that these are only mirrors that can be frozen,
blocked or deleted at any time, as was the case in 2022.
## MithrilDB and Future
<!-- section-begin mithril -->
The next version is under non-public development from scratch and will be
released as **MithrilDB** and `libmithrildb` for libraries & packages.
Admittedly mythical [Mithril](https://en.wikipedia.org/wiki/Mithril) is Admittedly mythical [Mithril](https://en.wikipedia.org/wiki/Mithril) is
resembling silver but being stronger and lighter than steel. Therefore resembling silver but being stronger and lighter than steel. Therefore
_MithrilDB_ is a rightly relevant name. _MithrilDB_ is rightly relevant name.
> _MithrilDB_ will be radically different from _libmdbx_ by the new
> database format and API based on C++17, as well as the [Apache 2.0
> License](https://www.apache.org/licenses/LICENSE-2.0). The goal of this
> revolution is to provide a clearer and robust API, add more features and
> new valuable properties of database.
_MithrilDB_ is radically different from _libmdbx_ by the new database [![https://t.me/libmdbx](https://raw.githubusercontent.com/wiki/erthink/libmdbx/img/telegram.png)](https://t.me/libmdbx)
format and API based on C++20. The goal of this revolution is to provide [![Build Status](https://travis-ci.org/erthink/libmdbx.svg?branch=master)](https://travis-ci.org/erthink/libmdbx)
a clearer and robust API, add more features and new valuable properties [![Build status](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master)
of the database. All fundamental architectural problems of libmdbx/LMDB [![CircleCI](https://circleci.com/gh/erthink/libmdbx/tree/master.svg?style=svg)](https://circleci.com/gh/erthink/libmdbx/tree/master)
have been solved there, but now the active development has been [![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx)
suspended for top-three reasons: [![Build Status](https://api.cirrus-ci.com/github/erthink/libmdbx.svg)](https://cirrus-ci.com/github/erthink/libmdbx)
1. For now _libmdbx_ mostly enough and Im busy for scalability. *The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.*
2. Waiting for fresh [Elbrus CPU](https://wiki.elbrus.ru/) of [e2k architecture](https://en.wikipedia.org/wiki/Elbrus_2000),
especially with hardware acceleration of [Streebog](https://en.wikipedia.org/wiki/Streebog) and
[Kuznyechik](https://en.wikipedia.org/wiki/Kuznyechik), which are required for Merkle tree, etc.
3. The expectation of needs and opportunities due to the wide use of NVDIMM (aka persistent memory),
modern NVMe and [Ангара](https://ru.wikipedia.org/wiki/Ангара_(интерконнект)).
However, _MithrilDB_ will not be available for countries unfriendly to
Russia (i.e. acceded the sanctions, devil adepts and/or NATO). But it is
not yet known whether such restriction will be implemented only through
a license and support, either the source code will not be open at all.
Basically I am not inclined to allow my work to contribute to the
profit that goes to weapons that kill my relatives and friends.
NO OPTIONS.
Nonetheless, I try not to make any promises regarding _MithrilDB_ until release.
Contrary to _MithrilDB_, _libmdbx_ will forever free and open source.
Moreover with high-quality support whenever possible. Tu deviens
responsable pour toujours de ce que tu as apprivois. So I will continue
to comply with the original open license and the principles of
constructive cooperation, in spite of outright Github sabotage and
sanctions. I will also try to keep (not drop) Windows support, despite
it is an unused obsolete technology for us.
<!-- section-end -->
```
$ objdump -f -h -j .text libmdbx.so
libmdbx.so: формат файла elf64-e2k
архитектура: elbrus-v6:64, флаги 0x00000150:
HAS_SYMS, DYNAMIC, D_PAGED
начальный адрес 0x00000000??????00
Разделы:
Idx Name Разм VMA LMA Фа смещ. Выр. Флаги
10 .text 000e7460 0000000000025c00 0000000000025c00 00025c00 2**10 CONTENTS, ALLOC, LOAD, READONLY, CODE
$ cc --version
lcc:1.27.14:Jan-31-2024:e2k-v6-linux
gcc (GCC) 9.3.0 compatible
```
----- -----
## Table of Contents ## Table of Contents
- [Characteristics](#characteristics) - [Overview](#overview)
- [Features](#features) - [Features](#features)
- [Limitations](#limitations) - [Limitations](#limitations)
- [Gotchas](#gotchas) - [Caveats & Gotchas](#caveats--gotchas)
- [Comparison with other databases](#comparison-with-other-databases) - [Comparison with other databases](#comparison-with-other-databases)
- [Improvements beyond LMDB](#improvements-beyond-lmdb) - [Improvements beyond LMDB](#improvements-beyond-lmdb)
- [History & Acknowledgments](#history) - [History & Acknowledgments](#history)
- [Usage](#usage) - [Usage](#usage)
- [Building and Testing](#building-and-testing) - [Building](#building)
- [API description](#api-description) - [API description](#api-description)
- [Bindings](#bindings) - [Bindings](#bindings)
- [Performance comparison](#performance-comparison) - [Performance comparison](#performance-comparison)
@@ -191,9 +90,7 @@ $ cc --version
- [Async-write mode](#async-write-mode) - [Async-write mode](#async-write-mode)
- [Cost comparison](#cost-comparison) - [Cost comparison](#cost-comparison)
# Characteristics # Overview
<!-- section-begin characteristics -->
## Features ## Features
@@ -203,7 +100,7 @@ $ cc --version
[MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) [MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control)
and [CoW](https://en.wikipedia.org/wiki/Copy-on-write). and [CoW](https://en.wikipedia.org/wiki/Copy-on-write).
- Multiple key-value tables/sub-databases within a single datafile. - Multiple key-value sub-databases within a single datafile.
- Range lookups, including range query estimation. - Range lookups, including range query estimation.
@@ -215,13 +112,13 @@ and [CoW](https://en.wikipedia.org/wiki/Copy-on-write).
- Transactions for readers and writers, ones do not block others. - Transactions for readers and writers, ones do not block others.
- Writes are strongly serialized. No transaction conflicts nor deadlocks. - Writes are strongly serialized. No transactions conflicts nor deadlocks.
- Readers are [non-blocking](https://en.wikipedia.org/wiki/Non-blocking_algorithm), notwithstanding [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation). - Readers are [non-blocking](https://en.wikipedia.org/wiki/Non-blocking_algorithm), notwithstanding [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation).
- Nested write transactions. - Nested write transactions.
- Reads scale linearly across CPUs. - Reads scales linearly across CPUs.
- Continuous zero-overhead database compactification. - Continuous zero-overhead database compactification.
@@ -235,54 +132,43 @@ and [CoW](https://en.wikipedia.org/wiki/Copy-on-write).
- Append operation for efficient bulk insertion of pre-sorted data. - Append operation for efficient bulk insertion of pre-sorted data.
- No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) nor any transaction journal. No crash recovery needed. No maintenance is required. - No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) nor any
transaction journal. No crash recovery needed. No maintenance is required.
- No internal cache and/or memory management, all done by basic OS services. - No internal cache and/or memory management, all done by basic OS services.
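The append operation mentioned in the feature list above can be sketched roughly as follows in C; `env` and `dbi` are assumed to be already opened, error handling is abbreviated, and the exact put-flag spelling (`MDBX_APPEND`) should be verified against the bundled header.

```c
#include "mdbx.h"
#include <stddef.h>

/* Bulk-load pre-sorted records: MDBX_APPEND lets the engine skip the ordered
 * insertion search because every new key is greater than the previous one. */
static int bulk_load(MDBX_env *env, MDBX_dbi dbi,
                     MDBX_val *keys, MDBX_val *values, size_t n) {
  MDBX_txn *txn;
  MDBX_cursor *cursor;
  int rc = mdbx_txn_begin(env, NULL, 0, &txn);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_cursor_open(txn, dbi, &cursor);
  if (rc == MDBX_SUCCESS) {
    for (size_t i = 0; rc == MDBX_SUCCESS && i < n; ++i)
      rc = mdbx_cursor_put(cursor, &keys[i], &values[i], MDBX_APPEND);
    mdbx_cursor_close(cursor);
  }
  if (rc == MDBX_SUCCESS)
    return mdbx_txn_commit(txn);
  mdbx_txn_abort(txn);
  return rc;
}
```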
## Limitations ## Limitations
- **Page size**: a power of 2, minimum `256` (mostly for testing), maximum `65536` bytes, default `4096` bytes. - **Page size**: a power of 2, maximum `65536` bytes, default `4096` bytes.
- **Key size**: minimum `0`, maximum ≈½ pagesize (`2022` bytes for default 4K pagesize, `32742` bytes for 64K pagesize). - **Key size**: minimum 0, maximum ≈¼ pagesize (`1300` bytes for default 4K pagesize, `21780` bytes for 64K pagesize).
- **Value size**: minimum `0`, maximum `2146435072` (`0x7FF00000`) bytes for maps, ≈½ pagesize for multimaps (`2022` bytes for default 4K pagesize, `32742` bytes for 64K pagesize). - **Value size**: minimum 0, maximum `2146435072` (`0x7FF00000`) bytes for maps, ≈¼ pagesize for multimaps (`1348` bytes default 4K pagesize, `21828` bytes for 64K pagesize).
- **Write transaction size**: up to `1327217884` pages (`4.944272` TiB for default 4K pagesize, `79.108351` TiB for 64K pagesize). - **Write transaction size**: up to `4194301` (`0x3FFFFD`) pages (16 [GiB](https://en.wikipedia.org/wiki/Gibibyte) for default 4K pagesize, 256 [GiB](https://en.wikipedia.org/wiki/Gibibyte) for 64K pagesize).
- **Database size**: up to `2147483648` pages (`8.0` TiB for default 4K pagesize, `128.0` TiB for 64K pagesize). - **Database size**: up to `2147483648` pages (8 [TiB](https://en.wikipedia.org/wiki/Tebibyte) for default 4K pagesize, 128 [TiB](https://en.wikipedia.org/wiki/Tebibyte) for 64K pagesize).
- **Maximum tables/sub-databases**: `32765`. - **Maximum sub-databases**: `32765`.
## Gotchas ## Caveats & Gotchas
1. There cannot be more than one writer at a time, i.e. no more than one write transaction at a time. 1. There cannot be more than one writer at a time, i.e. no more than one write transaction at a time.
2. _libmdbx_ is based on [B+ tree](https://en.wikipedia.org/wiki/B%2B_tree), so access to database pages is mostly random. 2. MDBX is based on [B+ tree](https://en.wikipedia.org/wiki/B%2B_tree), so access to database pages is mostly random.
Thus SSDs provide a significant performance boost over spinning disks for large databases. Thus SSDs provide a significant performance boost over spinning disks for large databases.
3. _libmdbx_ uses [shadow paging](https://en.wikipedia.org/wiki/Shadow_paging) instead of [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging). 3. MDBX uses [shadow paging](https://en.wikipedia.org/wiki/Shadow_paging) instead of [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging). Thus syncing data to disk might be bottleneck for write intensive workload.
Thus syncing data to disk might be a bottleneck for write intensive workload.
4. _libmdbx_ uses [copy-on-write](https://en.wikipedia.org/wiki/Copy-on-write) for [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation) during updates, 4. MDBX uses [copy-on-write](https://en.wikipedia.org/wiki/Copy-on-write) for [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation) during updates, but read transactions prevents recycling an old retired/freed pages, since it read ones. Thus altering of data during a parallel
but read transactions prevents recycling an old retired/freed pages, since it read ones. Thus altering of data during a parallel
long-lived read operation will increase the process work set, may exhaust entire free database space, long-lived read operation will increase the process work set, may exhaust entire free database space,
the database can grow quickly, and result in performance degradation. the database can grow quickly, and result in performance degradation.
Try to avoid long running read transactions, otherwise use [transaction parking](https://libmdbx.dqdkfa.ru/group__c__transactions.html#ga2c2c97730ff35cadcedfbd891ac9b12f) Try to avoid long running read transactions.
and/or [Handle-Slow-Readers callback](https://libmdbx.dqdkfa.ru/group__c__err.html#ga2cb11b56414c282fe06dd942ae6cade6).
5. _libmdbx_ is extraordinarily fast and provides minimal overhead for data access, 5. MDBX is extraordinarily fast and provides minimal overhead for data access,
so you should reconsider using brute force techniques and double check your code. so you should reconsider about use brute force techniques and double check your code.
On the one hand, in the case of _libmdbx_, a simple linear search may be more profitable than complex indexes. On the one hand, in the case of MDBX, a simple linear search may be more profitable than complex indexes.
On the other hand, if you make something suboptimally, you can notice detrimentally only on sufficiently large data. On the other hand, if you make something suboptimally, you can notice a detrimentally only on sufficiently large data.
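One common way to avoid the long-lived-reader problem described in the gotchas above is to keep a reader handle around but release its snapshot between uses. A rough C sketch of that pattern, assuming the conventional `mdbx_txn_reset()`/`mdbx_txn_renew()` pair; the `consume()` and `do_other_work()` helpers are hypothetical placeholders.

```c
#include "mdbx.h"

/* Hypothetical application hooks, stand-ins for real consumer code. */
void consume(MDBX_val value);
void do_other_work(void);

/* Keep one reader handle but hold a live snapshot only while actually reading:
 * mdbx_txn_reset() drops the snapshot so retired pages may be recycled,
 * mdbx_txn_renew() re-arms the same handle with a fresh snapshot. */
void polling_reader(MDBX_env *env, MDBX_dbi dbi) {
  MDBX_txn *reader;
  if (mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &reader) != MDBX_SUCCESS)
    return;
  for (int i = 0; i < 1000; ++i) {
    MDBX_val key = {.iov_base = "hello", .iov_len = 5}, value;
    if (mdbx_get(reader, dbi, &key, &value) == MDBX_SUCCESS)
      consume(value);
    mdbx_txn_reset(reader); /* no snapshot held while doing unrelated work */
    do_other_work();
    if (mdbx_txn_renew(reader) != MDBX_SUCCESS)
      break;
  }
  mdbx_txn_abort(reader); /* release the reader slot */
}
```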
## Comparison with other databases ### Comparison with other databases
For now please refer to [chapter of "BoltDB comparison with other For now please refer to [chapter of "BoltDB comparison with other
databases"](https://github.com/coreos/bbolt#comparison-with-other-databases) databases"](https://github.com/coreos/bbolt#comparison-with-other-databases)
which is also (mostly) applicable to _libmdbx_ with minor clarification: which is also (mostly) applicable to _libmdbx_.
- a database could shared by multiple processes, i.e. no multi-process issues;
- no issues with moving a cursor(s) after the deletion;
- _libmdbx_ provides zero-overhead database compactification, so a database file could be shrinked/truncated in particular cases;
- excluding disk I/O time _libmdbx_ could be ≈3 times faster than BoltDB and up to 10-100K times faster than both BoltDB and LMDB in particular extreme cases;
- _libmdbx_ provides more features compared to BoltDB and/or LMDB.
<!-- section-end -->
<!-- section-begin improvements -->
Improvements beyond LMDB Improvements beyond LMDB
======================== ========================
@@ -294,89 +180,79 @@ out-of-the-box, not silently and catastrophically break down. The list
below is pruned down to the improvements most notable and obvious from below is pruned down to the improvements most notable and obvious from
the user's point of view. the user's point of view.
## Some Added Features ### Added Features:
1. Keys could be more than 2 times longer than _LMDB_. 1. Keys could be more than 2 times longer than _LMDB_.
> For DB with default page size _libmdbx_ support keys up to 2022 bytes > For DB with default page size _libmdbx_ support keys up to 1300 bytes
> and up to 32742 bytes for 64K page size. _LMDB_ allows key size up to > and up to 21780 bytes for 64K page size. _LMDB_ allows key size up to
> 511 bytes and may silently loses data with large values. > 511 bytes and may silently loses data with large values.
2. Up to 30% faster than _LMDB_ in [CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete) benchmarks. 2. Up to 20% faster than _LMDB_ in [CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete) benchmarks.
> Benchmarks of the in-[tmpfs](https://en.wikipedia.org/wiki/Tmpfs) scenarios, > Benchmarks of the in-[tmpfs](https://en.wikipedia.org/wiki/Tmpfs) scenarios,
> that tests the speed of the engine itself, showned that _libmdbx_ 10-20% faster than _LMDB_, > that tests the speed of engine itself, shown that _libmdbx_ 10-20% faster than _LMDB_.
> and up to 30% faster when _libmdbx_ compiled with specific build options > These and other results could be easily reproduced with [ioArena](https://github.com/pmwkaa/ioarena) just by `make bench-quartet`,
> which downgrades several runtime checks to be match with LMDB behaviour. > including comparisons with [RockDB](https://en.wikipedia.org/wiki/RocksDB)
> > and [WiredTiger](https://en.wikipedia.org/wiki/WiredTiger).
> However, libmdbx may be slower than LMDB on Windows, since uses native file locking API.
> These locks are really slow, but they prevent an inconsistent backup from being obtained by copying the DB file during an ongoing write transaction.
> So I think this is the right decision, and for speed, it's better to use Linux, or ask Microsoft to fix up file locks.
>
> Noted above and other results could be easily reproduced with [ioArena](https://abf.io/erthink/ioarena) just by `make bench-quartet` command,
> including comparisons with [RockDB](https://en.wikipedia.org/wiki/RocksDB)
> and [WiredTiger](https://en.wikipedia.org/wiki/WiredTiger).
3. Automatic on-the-fly database size adjustment, both increment and reduction. 3. Automatic on-the-fly database size adjustment, both increment and reduction.
> _libmdbx_ manages the database size according to parameters specified > _libmdbx_ manage the database size according to parameters specified
> by `mdbx_env_set_geometry()` function, > by `mdbx_env_set_geometry()` function,
> ones include the growth step and the truncation threshold. > ones include the growth step and the truncation threshold.
> >
> Unfortunately, on-the-fly database size adjustment doesn't work under [Wine](https://en.wikipedia.org/wiki/Wine_(software)) > Unfortunately, on-the-fly database size adjustment doesn't work under [Wine](https://en.wikipedia.org/wiki/Wine_(software))
> due to its internal limitations and unimplemented functions, i.e. the `MDBX_UNABLE_EXTEND_MAPSIZE` error will be returned. > due to its internal limitations and unimplemented functions, i.e. the `MDBX_UNABLE_EXTEND_MAPSIZE` error will be returned.
4. Automatic continuous zero-overhead database compactification.
> During each commit _libmdbx_ merges freeing pages which are adjacent to the unallocated area
> at the end of the file, and then truncates the unused space once enough of it has accumulated.
5. The same database format for 32- and 64-bit builds.
> _libmdbx_ database format depends only on the [endianness](https://en.wikipedia.org/wiki/Endianness) but not on the [bitness](https://en.wiktionary.org/wiki/bitness).
6. The "Big Foot" feature than solves speific performance issues with huge transactions and extra-large page-number-lists. 6. LIFO policy for Garbage Collection recycling. This can significantly increase write performance due write-back disk cache up to several times in a best case scenario.
> LIFO means that for reuse will be taken latest became unused pages.
> Therefore the loop of database pages circulation becomes as short as possible.
> In other words, the set of pages, that are (over)written in memory and on disk during a series of write transactions, will be as small as possible.
> Thus creates ideal conditions for the battery-backed or flash-backed disk cache efficiency.
7. LIFO policy for Garbage Collection recycling. This can significantly increase write performance due write-back disk cache up to several times in a best case scenario. 7. Fast estimation of range query result volume, i.e. how many items can
> LIFO means that for reuse will be taken the latest becomes unused pages. be found between a `KEY1` and a `KEY2`. This is prerequisite for build
> Therefore the loop of database pages circulation becomes as short as possible.
> In other words, the set of pages, that are (over)written in memory and on disk during a series of write transactions, will be as small as possible.
> Thus creates ideal conditions for the battery-backed or flash-backed disk cache efficiency.
8. Parking of read transactions with ousting and auto-restart, and a [Handle-Slow-Readers callback](https://libmdbx.dqdkfa.ru/group__c__err.html#ga2cb11b56414c282fe06dd942ae6cade6) to resolve issues caused by long-lived read transactions.
9. Fast estimation of range query result volume, i.e. how many items can
be found between a `KEY1` and a `KEY2`. This is a prerequisite for building
and/or optimizing query execution plans (see the sketch after this list).
> _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to the corresponding keys.
10. Database integrity check API as well as the standalone `mdbx_chk` utility.
11. Support for opening databases in exclusive mode, including on a network share.
12. Extended information about the whole database, tables/sub-databases, transactions and readers enumeration.
> _libmdbx_ provides a lot of information, including dirty and leftover pages
> for a write transaction, reading lag and holdover space for read transactions.
13. Support of zero-length keys and values.
14. Useful runtime options for tuning the engine to the application's requirements and specific use cases.
15. Automated steady sync-to-disk upon several thresholds and/or timeout via cheap polling.
16. Ability to determine whether the particular data is on a dirty page
or not, which allows avoiding a copy-out before updates.
17. Extended update and delete operations (see the sketch after this list).
> _libmdbx_ allows one _at once_ to get the previous value
> and to address the particular item of a multi-value set with the same key.
18. Sequence generation and three persistent 64-bit vector-clock-like markers (see the sketch after this list).
## Other fixes and specifics

1. Fixed more than 10 significant errors, in particular: page leaks,
wrong table/sub-database statistics, segfault in several conditions,
non-optimal page merge strategy, updating an existing record with
a change in data size (including for multimap), etc.
2. All cursors can be reused and should be closed explicitly,
regardless of whether they were opened within a write or read transaction
(see the sketch below, after this list).
3. Opening database handles are spared from race conditions and
pre-opening is not needed.
@ -384,16 +260,17 @@ pre-opening is not needed.
4. Returning the `MDBX_EMULTIVAL` error in case of an ambiguous update or delete.
5. Guarantee of database integrity even in the asynchronous unordered write-to-disk mode.
> _libmdbx_ proposes an additional trade-off by `MDBX_SAFE_NOSYNC` with an append-like manner for updates,
> which avoids database corruption after a system crash, contrary to LMDB.
> Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode is available to match LMDB's behaviour for `MDB_NOSYNC`
> (see the sync-threshold sketch below, after this list).
6. On **MacOS & iOS** the `fcntl(F_FULLFSYNC)` syscall is used _by
default_ to synchronize data with the disk, as this is [the only way to
guarantee data
durability](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html)
in case of power failure. Unfortunately, in scenarios with high write
intensity, the use of `F_FULLFSYNC` significantly degrades performance
compared to LMDB, where the `fsync()` syscall is used. Therefore,
_libmdbx_ allows you to override this behavior by defining the
`MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` option while building the library.
@ -402,182 +279,66 @@ _libmdbx_ allows you to override this behavior by defining the
it allows placing the database on network drives, and provides protection
against incompetent user actions (aka
[poka-yoke](https://en.wikipedia.org/wiki/Poka-yoke)). Therefore
_libmdbx_ may lag a little behind LMDB in performance tests where the
named mutexes are used.
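
A sketch of the cursor reuse mentioned in item 2 above, assuming the `mdbx_cursor_create()`/`mdbx_cursor_bind()` pair; error handling is shortened:

```c
#include "mdbx.h"

/* Create a cursor once, re-bind it to successive read transactions,
 * and close it explicitly when it is no longer needed. */
int scan_twice(MDBX_env *env, MDBX_dbi dbi) {
  MDBX_cursor *cursor = mdbx_cursor_create(NULL);
  if (!cursor)
    return MDBX_ENOMEM;
  int rc = MDBX_SUCCESS;
  for (int i = 0; i < 2 && rc == MDBX_SUCCESS; ++i) {
    MDBX_txn *txn;
    rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn);
    if (rc != MDBX_SUCCESS)
      break;
    rc = mdbx_cursor_bind(txn, cursor, dbi); /* reuse the same cursor */
    if (rc == MDBX_SUCCESS) {
      MDBX_val key, data;
      while (mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT) == MDBX_SUCCESS)
        ; /* iterate over all records */
    }
    mdbx_txn_abort(txn);
  }
  mdbx_cursor_close(cursor);
  return rc;
}
```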
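And a sketch for item 5, combining `MDBX_SAFE_NOSYNC` with the automatic steady sync-to-disk thresholds; the threshold values are arbitrary examples, assuming the `mdbx_env_set_syncbytes()` and `mdbx_env_set_syncperiod()` calls:

```c
#include "mdbx.h"

/* Open in MDBX_SAFE_NOSYNC mode, but ask the engine for a steady
 * sync-to-disk after every 16 MiB of written data or every 2.5 seconds
 * (the period is given in 1/65536-th of a second units). */
int open_safe_nosync(MDBX_env *env, const char *path) {
  int rc = mdbx_env_open(env, path, MDBX_NOSUBDIR | MDBX_SAFE_NOSYNC, 0644);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_env_set_syncbytes(env, 16ul << 20);
  if (rc != MDBX_SUCCESS)
    return rc;
  return mdbx_env_set_syncperiod(env, 65536 * 5 / 2);
}
```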
<!-- section-end -->
<!-- section-begin history -->

# History

Historically, _libmdbx_ is a deeply revised and extended descendant of the
[Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
At first the development was carried out within the
[ReOpenLDAP](https://web.archive.org/web/https://github.com/erthink/ReOpenLDAP) project. About a
year later _libmdbx_ was separated into a standalone project, which was
[presented at the Highload++ 2015
conference](http://www.highload.ru/2015/abstracts/1831.html).

Since 2017 _libmdbx_ is used in [Fast Positive Tables](https://gitflic.ru/project/erthink/libfpta),
and until 2025 development was funded by [Positive Technologies](https://www.ptsecurity.com).
Since 2020 _libmdbx_ is used in Ethereum: [Erigon](https://github.com/erigontech/erigon), [Akula](https://github.com/akula-bft/akula),
[Silkworm](https://github.com/erigontech/silkworm), [Reth](https://github.com/paradigmxyz/reth), etc.
On 2022-04-15 the Github administration, without any warning nor
explanation, deleted _libmdbx_ along with a lot of other projects,
simultaneously blocking access for many developers. Therefore on
2022-04-21 I have migrated to a reliable trusted infrastructure.
The origin for now is at [GitFlic](https://gitflic.ru/project/erthink/libmdbx)
with a backup at [ABF by ROSA Лаб](https://abf.rosalinux.ru/erthink/libmdbx).
For the same reason ~~Github~~ is blacklisted forever.

Since May 2024 and version 0.13 _libmdbx_ has been re-licensed under the Apache-2.0 license.
Please refer to the [`COPYRIGHT` file](https://gitflic.ru/project/erthink/libmdbx/blob/raw?file=COPYRIGHT) for license change explanations.
## Acknowledgments

Howard Chu <hyc@openldap.org> and Hallvard Furuseth
<hallvard@openldap.org> are the authors of _LMDB_, from which _libmdbx_
was forked in 2015.

Martin Hedenfalk <martin@bzero.se> is the author of the `btree.c` code, which
was used to begin the development of _LMDB_.

<!-- section-end -->
--------------------------------------------------------------------------------

Usage
=====
<!-- section-begin usage -->
Currently, libmdbx is only available in a
[source code](https://en.wikipedia.org/wiki/Source_code) form.
Package support for common Linux distributions is planned for the future,
starting from the release of version 1.0.
## Source code embedding

_libmdbx_ provides three official ways for integration in source code form:

1. Using an amalgamated source code which is available in the [releases section](https://gitflic.ru/project/erthink/libmdbx/release) on GitFlic.
> An amalgamated source code includes all files required to build and
> use _libmdbx_, but not for testing _libmdbx_ itself.
> Besides the releases, amalgamated sources could be created at any time from the original clone of the git
> repository on Linux by executing `make dist`. As a result, the desired
> set of files will be formed in the `dist` subdirectory.

2. Using the [Conan Package Manager](https://conan.io/):
 - optional: Setup your own conan-server;
 - Create a conan-package by `conan create .` inside the _libmdbx_' repo subdirectory;
 - optional: Upload the created recipe and/or package to the conan-server by `conan upload -r SERVER 'mdbx/*'`;
 - Consume the libmdbx-package from the local conan-cache or from the conan-server in accordance with the [Conan tutorial](https://docs.conan.io/2/tutorial/consuming_packages.html).

3. Adding the complete source code as a `git submodule` from the [origin git repository](https://gitflic.ru/project/erthink/libmdbx) on GitFlic.
> This allows you to build both _libmdbx_ and the testing tool.
> On the other hand, this way requires you to pull git tags, and to use a C++11 compiler for the test tool.

_**Please, avoid using any other techniques.**_ Otherwise, at least
don't ask for support and don't name such chimeras `libmdbx`.
## Building and Testing

Both the amalgamated and the original source code provide building through the use of
[CMake](https://cmake.org/) or [GNU
Make](https://www.gnu.org/software/make/) with
[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)).
All build ways
are completely traditional and have minimal prerequirements like
`build-essential`, i.e. a non-obsolete C/C++ compiler and an
[SDK](https://en.wikipedia.org/wiki/Software_development_kit) for the
target platform.

Obviously you need the building tools themselves, i.e. `git`,
`cmake` or GNU `make` with `bash`. For your convenience, `make help`
and `make options` are also available for listing existing targets
and build options respectively.

The only significant specificity is that git tags are required
to build from the complete (not amalgamated) source codes.
Executing **`git fetch --tags --force --prune`** is enough to get them,
and `--unshallow` or `--update-shallow` is required for the shallow-cloned case.

So just use CMake or GNU Make in your habitual manner and feel free to
file an issue or make a pull request in case something is
unexpected or broken.
### Testing
The amalgamated source code does not contain any tests for several reasons.
Please read [the explanation](https://libmdbx.dqdkfa.ru/dead-github/issues/214#issuecomment-870717981) and don't ask to alter this.
So for testing _libmdbx_ itself you need the full source code, i.e. the clone of the git repository; there is no other option.

The full source code of _libmdbx_ has a [`test` subdirectory](https://gitflic.ru/project/erthink/libmdbx/tree/master/test) with a minimalistic test "framework".
Actually it is the source code of the `mdbx_test` console utility, which has a set of command-line options that allow constructing and running reasonably thorough test scenarios.
This test utility is intended for _libmdbx_'s developers for testing the library itself, not for use by end users.
Therefore, only basic information is provided:

- There are a few CRUD-based test cases (hill, TTL, nested, append, jitter, etc.),
which can be combined to test concurrent operations within a shared database in a multi-process environment.
This is the `basic` test scenario.
- The `Makefile` provides several self-described targets for testing: `smoke`, `test`, `check`, `memcheck`, `test-valgrind`,
`test-asan`, `test-leak`, `test-ubsan`, `cross-gcc`, `cross-qemu`, `gcc-analyzer`, `smoke-fault`, `smoke-singleprocess`,
`test-singleprocess`, `long-test`. Please run `make --help` if in doubt.
- In addition to the `mdbx_test` utility, there is the script [`stochastic.sh`](https://gitflic.ru/project/erthink/libmdbx/blob/master/test/stochastic.sh),
which calls `mdbx_test` going through a set of modes and options, while gradually increasing the number of operations and the size of transactions.
This script is used for most of the automatic testing, including the `Makefile` targets and Continuous Integration.
- Brief information about the available command-line options is given by `--help`.
However, you should dive into the source code to get the full picture; there is no other option.

Anyway, no matter how thoroughly _libmdbx_ is tested, you should rely only on your own tests for a few reasons:

1. Most use cases are unique.
So there is no warranty that your use case was properly tested, even though _libmdbx_'s tests engage a stochastic approach.
2. If there are problems, then your test on the one hand will help to verify whether you are using _libmdbx_ correctly,
and on the other hand it will allow you to reproduce the problem and insure against regression in the future.
3. Actually you should rely only on what you have checked yourself, or take the risk.
### Common important details
#### Build reproducibility
By default _libmdbx_ tracks the build time via the `MDBX_BUILD_TIMESTAMP` build option and macro.
So for [reproducible builds](https://en.wikipedia.org/wiki/Reproducible_builds) you should predefine/override it to a known fixed string value.
For instance:
- for reproducible build with make: `make MDBX_BUILD_TIMESTAMP=unknown ` ...
- or during configure by CMake: `cmake -DMDBX_BUILD_TIMESTAMP:STRING=unknown ` ...
Of course, in addition to this, your toolchain must ensure the reproducibility of builds.
For more information please refer to [reproducible-builds.org](https://reproducible-builds.org/).
#### Containers
There are no special traits nor quirks if you use _libmdbx_ ONLY inside
a single container. But in cross-container cases, or with host-container
interoperability, the three major things MUST be guaranteed:

1. Coherence of memory mapping content and a unified page cache inside the OS
kernel for the host and all container(s) operating with a DB. Basically this
means there must be only a single physical copy of each memory-mapped DB page
in the system memory.

2. Uniqueness of [PID](https://en.wikipedia.org/wiki/Process_identifier) values and/or a common space for them:
 - for POSIX systems: PID uniqueness for all processes operating with a DB.
I.e. `--pid=host` is required to run DB-aware processes inside Docker,
or, without host interaction, `--pid=container:<name|id>` with the same name/id.
 - for non-POSIX (i.e. Windows) systems: inter-visibility of process handles.
I.e. `OpenProcess(SYNCHRONIZE, ..., PID)` must return a reasonable error,
including `ERROR_ACCESS_DENIED`,
but not `ERROR_INVALID_PARAMETER` as for an invalid/non-existent PID
(see the sketch after this list).

3. The versions/builds of _libmdbx_ and `libc`/`pthreads` (`glibc`, `musl`, etc) must be compatible.
 - Basically, the `options:` string in the output of `mdbx_chk -V` must be the same for the host and container(s).
See `MDBX_LOCKING`, `MDBX_USE_OFDLOCKS` and other build options for details.
 - Avoid using different versions of `libc`, especially mixing different implementations, i.e. `glibc` with `musl`, etc.
Prefer to use the same LTS version, or switch to full virtualization/isolation if in doubt.
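
As an illustration of the Windows-side check from item 2, here is a sketch (not part of the _libmdbx_ API) of probing whether another process' handle is visible; the `FALSE` inheritance argument is an assumption of this example:

```c
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#include <stdbool.h>

/* Returns true when `pid` is visible from the current process, i.e. the
 * inter-process handle visibility required by item 2 above is satisfied. */
bool pid_is_visible(DWORD pid) {
  HANDLE h = OpenProcess(SYNCHRONIZE, FALSE, pid);
  if (h) {
    CloseHandle(h);
    return true;
  }
  /* ERROR_ACCESS_DENIED still proves the PID exists and is addressable,
   * while ERROR_INVALID_PARAMETER means an invalid/non-existent PID. */
  return GetLastError() == ERROR_ACCESS_DENIED;
}
#endif
```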
#### DSO/DLL unloading and destructors of Thread-Local-Storage objects

When building _libmdbx_ as a shared library, or using a static _libmdbx_ as a
part of another dynamic library, it is advisable to make sure that your
@ -606,138 +367,101 @@ where there are no similar bugs in the pthreads implementation.
### Linux and other platforms with GNU Make

To build the library it is enough to execute `make all` in the directory
of the source code, and `make check` to execute the basic tests.

If the `make` installed on the system is not GNU Make, there will be a
lot of errors from make when trying to build. In this case, perhaps you
should use `gmake` instead of `make`, or even `gnu-make`, etc.

### FreeBSD and related platforms

As a rule, on BSD and its derivatives the default is to use Berkeley Make, and
[Bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)) is not installed.

So you need to install the required components: GNU Make, Bash, and C and C++
compilers compatible with GCC or CLANG. After that, to build the
library, it is enough to execute `gmake all` (or `make all`) in the
directory with the source code, and `gmake check` (or `make check`) to run
the basic tests.
### Windows

For building _libmdbx_ on Windows the _original_ CMake and [Microsoft Visual
Studio 2019](https://en.wikipedia.org/wiki/Microsoft_Visual_Studio) are
recommended. Please use recent versions of CMake, Visual Studio and the Windows
SDK to avoid troubles with C11 support and the `alignas()` feature.

For building with MinGW, version 10.2 or a more recent one coupled with a modern CMake is required.
So it is recommended to use [chocolatey](https://chocolatey.org/) to install and/or update them.

Other ways to build are potentially possible, but are not supported and will not be.
The `CMakeLists.txt` or `GNUMakefile` scripts will probably need to be modified accordingly.
When using other methods, do not forget to add `ntdll.lib` to linking.

It should be noted that in _libmdbx_ there were efforts to avoid
runtime dependencies on the CRT and other MSVC libraries.
For this it is enough to pass the `-DMDBX_WITHOUT_MSVC_CRT:BOOL=ON` option
during configuring by CMake.

To run the [long stochastic test scenario](test/stochastic.sh),
[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)) is required, and
such testing is recommended with the test data placed on a
[RAM-disk](https://en.wikipedia.org/wiki/RAM_drive).
### Windows Subsystem for Linux

_libmdbx_ could be used in a [WSL2](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux#WSL_2)
but NOT in a [WSL1](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux#WSL_1) environment.
This is a consequence of the fundamental shortcomings of _WSL1_ and cannot be fixed.
To avoid data loss, _libmdbx_ returns the `ENOLCK` (37, "No record locks available")
error when opening the database in a _WSL1_ environment.
### MacOS
The current [native build tools](https://en.wikipedia.org/wiki/Xcode) for
MacOS include GNU Make, CLANG and an outdated version of Bash.
However, the build script requires the GNU kind of `sed` and `tar`.
So the easiest way to install all prerequirements is to use [Homebrew](https://brew.sh/),
just by `brew install bash make cmake ninja gnu-sed gnu-tar --with-default-names`.

Next, to build the library, it is enough to run `make all` in the
directory with the source code, and run `make check` to execute the basic
tests. If something goes wrong, it is recommended to install
[Homebrew](https://brew.sh/) and try again.
To run the [long stochastic test scenario](test/stochastic.sh), you
will need to install the current (not outdated) version of
[Bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)).
Just install it as noted above.
### Android
I recommend using CMake to build _libmdbx_ for Android.
Please refer to the [official guide](https://developer.android.com/studio/projects/add-native-code).
### iOS
To build _libmdbx_ for iOS, I recommend using CMake with the
["toolchain file"](https://cmake.org/cmake/help/latest/variable/CMAKE_TOOLCHAIN_FILE.html)
from the [ios-cmake](https://github.com/leetal/ios-cmake) project.
<!-- section-end -->
## API description

Please refer to the online [_libmdbx_ API reference](https://libmdbx.dqdkfa.ru/docs)
and/or see the [mdbx.h++](mdbx.h%2B%2B) and [mdbx.h](mdbx.h) headers.

Please do not hesitate to point out errors in the documentation,
including creating [PR](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/proposing-changes-to-your-work-with-pull-requests) with corrections and improvements.
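
As a quick orientation, a minimal self-contained usage sketch (error handling is shortened; the path and key/value contents are illustrative):

```c
#include "mdbx.h"
#include <stdio.h>

int main(void) {
  MDBX_env *env = NULL;
  MDBX_txn *txn = NULL;
  MDBX_dbi dbi = 0;
  int rc = mdbx_env_create(&env);
  if (rc == MDBX_SUCCESS)
    rc = mdbx_env_open(env, "./example.db", MDBX_NOSUBDIR, 0644);
  if (rc == MDBX_SUCCESS)
    rc = mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &txn);
  if (rc == MDBX_SUCCESS)
    rc = mdbx_dbi_open(txn, NULL /* the main table */, MDBX_DB_DEFAULTS, &dbi);
  if (rc == MDBX_SUCCESS) {
    MDBX_val key = {.iov_base = "greeting", .iov_len = 8};
    MDBX_val data = {.iov_base = "hello, libmdbx", .iov_len = 14};
    rc = mdbx_put(txn, dbi, &key, &data, MDBX_UPSERT);
  }
  if (rc == MDBX_SUCCESS)
    rc = mdbx_txn_commit(txn);
  else if (txn)
    mdbx_txn_abort(txn);
  if (env)
    mdbx_env_close(env);
  printf("%s\n", rc == MDBX_SUCCESS ? "ok" : mdbx_strerror(rc));
  return rc == MDBX_SUCCESS ? 0 : 1;
}
```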
<!-- section-begin bindings -->

Bindings
========
| Runtime | Repo | Author |
| ------- | ------ | ------ |
| Rust | [libmdbx-rs](https://github.com/vorot93/libmdbx-rs) | [Artem Vorotnikov](https://github.com/vorot93) |
| Python | [PyPi/libmdbx](https://pypi.org/project/libmdbx/) | [Lazymio](https://github.com/wtdcode) |
| Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) |
| Go | [mdbx-go](https://github.com/torquem-ch/mdbx-go) | [Alex Sharov](https://github.com/AskAlexSharov) |
| Ruby | [ruby-mdbx](https://rubygems.org/gems/mdbx/) | [Mahlon E. Smith](https://github.com/mahlonsmith) |
| Zig | [mdbx-zig](https://github.com/theseyan/lmdbx-zig) | [Sayan J. Das](https://github.com/theseyan) |
##### Obsolete/Outdated/Unsupported:
| Runtime | Repo | Author |
| ------- | ------ | ------ |
| .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) |
| Scala | [mdbx4s](https://github.com/david-bouyssie/mdbx4s) | [David Bouyssié](https://github.com/david-bouyssie) |
| Rust | [mdbx](https://crates.io/crates/mdbx) | [gcxfd](https://github.com/gcxfd) |
| Haskell | [libmdbx-hs](https://hackage.haskell.org/package/libmdbx) | [Francisco Vallarino](https://github.com/fjvallarino) |
| Lua | [lua-libmdbx](https://github.com/mah0x211/lua-libmdbx) | [Masatoshi Fukunaga](https://github.com/mah0x211) |
| NodeJS, [Deno](https://deno.land/) | [lmdbx-js](https://github.com/kriszyp/lmdbx-js) | [Kris Zyp](https://github.com/kriszyp/) |
| NodeJS | [node-mdbx](https://www.npmjs.com/package/node-mdbx/) | [Сергей Федотов](mailto:sergey.fedotov@corp.mail.ru) |
| Nim | [NimDBX](https://github.com/snej/nimdbx) | [Jens Alfke](https://github.com/snej) |
<!-- section-end -->
--------------------------------------------------------------------------------

<!-- section-begin performance -->

Performance comparison
======================

Over the past 10 years, _libmdbx_ has had a lot of significant
improvements and innovations. _libmdbx_ has become slightly faster in
simple cases and many times faster in complex scenarios, especially with
huge transactions in gigantic databases. Therefore, on the one hand,
the results below are outdated. However, on the other hand, these simple
benchmarks are evident, easy to reproduce, and are close to the most
common use cases.

All the following benchmark results were obtained in 2015 by [IOArena](https://abf.io/erthink/ioarena)
and multiple [scripts](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015)
runs on my laptop (i7-4600U 2.1 GHz, SSD MZNTD512HAGL-000L1).
## Integral performance

Here is shown the sum of performance metrics in 3 benchmarks:

- Read/Search on the machine with 4 logical CPUs in HyperThreading mode (i.e. actually 2 physical CPU cores);

- Transactions with [CRUD](https://en.wikipedia.org/wiki/CRUD)
operations in sync-write mode (fdatasync is called after each
@ -755,17 +479,16 @@ Here showed sum of performance metrics in 3 benchmarks:
2. Performance gap is too high to compare in any meaningful way.

![Comparison #1: Integral Performance](https://libmdbx.dqdkfa.ru/img/perf-slide-1.png)
--------------------------------------------------------------------------------

## Read Scalability

Summary performance with concurrent read/search queries in 1-2-4-8
threads on the machine with 4 logical CPUs in HyperThreading mode (i.e.
actually 2 physical CPU cores).

![Comparison #2: Read Scalability](https://libmdbx.dqdkfa.ru/img/perf-slide-2.png)
--------------------------------------------------------------------------------
@ -779,15 +502,15 @@ actually 2 physical CPU cores).
execution time, cross marks standard deviation.

**10,000 transactions in sync-write mode**. In case of a crash all data
is consistent and conforms to the last successful transaction. The
[fdatasync](https://linux.die.net/man/2/fdatasync) syscall is used after
each write transaction in this mode.

In the benchmark each transaction contains combined CRUD operations (2
inserts, 1 read, 1 update, 1 delete). The benchmark starts on an empty database
and after a full run the database contains 10,000 small key-value records.

![Comparison #3: Sync-write mode](https://libmdbx.dqdkfa.ru/img/perf-slide-3.png)
--------------------------------------------------------------------------------
@ -801,20 +524,20 @@ and after full run the database contains 10,000 small key-value records.
execution time, cross marks standard deviation.

**100,000 transactions in lazy-write mode**. In case of a crash all data
is consistent and conforms to one of the last successful transactions, but
transactions after it will be lost. Other DB engines use a
[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) or a transaction
journal for that, which in turn depends on the order of operations in the
journaled filesystem. _libmdbx_ doesn't use a WAL and hands I/O operations
over to the filesystem and OS kernel (mmap).

In the benchmark each transaction contains combined CRUD operations (2
inserts, 1 read, 1 update, 1 delete). The benchmark starts on an empty database
and after a full run the database contains 100,000 small key-value
records.

![Comparison #4: Lazy-write mode](https://libmdbx.dqdkfa.ru/img/perf-slide-4.png)
--------------------------------------------------------------------------------
@ -827,18 +550,18 @@ records.
execution time of transactions. Each interval shows minimal and maximum
execution time, cross marks standard deviation.

**1,000,000 transactions in async-write mode**.
In case of a crash all data is consistent and conforms to one of the
last successful transactions, but the lost transaction count is much higher
than in lazy-write mode. All DB engines in this mode do as few writes
as possible on persistent storage. _libmdbx_ uses
[msync(MS_ASYNC)](https://linux.die.net/man/2/msync) in this mode.

In the benchmark each transaction contains combined CRUD operations (2
inserts, 1 read, 1 update, 1 delete). The benchmark starts on an empty database
and after a full run the database contains 10,000 small key-value records.

![Comparison #5: Async-write mode](https://libmdbx.dqdkfa.ru/img/perf-slide-5.png)
--------------------------------------------------------------------------------
@ -860,8 +583,10 @@ which prevents to meaningfully compare it with them.
All benchmark data is gathered by the
[getrusage()](http://man7.org/linux/man-pages/man2/getrusage.2.html)
syscall and by scanning the data directory.

![Comparison #6: Cost comparison](https://libmdbx.dqdkfa.ru/img/perf-slide-6.png)

<!-- section-end -->
TODO.md
@ -1,44 +0,0 @@
TODO
----
- [SWIG](https://www.swig.org/).
- Parallel LTO build with elimination of warnings.
- Integration with DTrace and similar tools.
- A new error-handling style with recording of a "trace" and of the causes.
- Producing debug information by means of gdb.
- WASM support.
- Early/non-deferred GC cleanup.
- Explicit and automatic compaction/defragmentation.
- Non-linear GC processing.
- Switch cursors to a doubly-linked list instead of a singly-linked one.
- Inside `txn_renew()`, move the mmap coherence check to after the resize.
- [Migration guide from LMDB to MDBX](https://libmdbx.dqdkfa.ru/dead-github/issues/199).
- [Support for RAW devices](https://libmdbx.dqdkfa.ru/dead-github/issues/124).
- [Support MessagePack for Keys & Values](https://libmdbx.dqdkfa.ru/dead-github/issues/115).
- Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc.
Done
----
- Refactoring of gc-get/gc-put with a transition to "interval" lists.
- [Engage new terminology](https://libmdbx.dqdkfa.ru/dead-github/issues/137).
- [More flexible support of asynchronous runtime/framework(s)](https://libmdbx.dqdkfa.ru/dead-github/issues/200).
- [Move most of `mdbx_chk` functional to the library API](https://libmdbx.dqdkfa.ru/dead-github/issues/204).
- [Simple careful mode for working with corrupted DB](https://libmdbx.dqdkfa.ru/dead-github/issues/223).
- [Engage an "overlapped I/O" on Windows](https://libmdbx.dqdkfa.ru/dead-github/issues/224).
- [Large/Overflow pages accounting for dirty-room](https://libmdbx.dqdkfa.ru/dead-github/issues/192).
- [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://libmdbx.dqdkfa.ru/dead-github/issues/193).
Cancelled
--------
- [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOSTICKYTHREADS` option](https://libmdbx.dqdkfa.ru/dead-github/issues/210).
This improvement cannot be implemented, since replacing the SRW lock
removes only the preliminary problem, but not the main one. On Windows,
shrinking a memory-mapped file is not supported by the OS kernel. To do
this, the mapping must be released, the file resized and then mapped
again. In turn, this requires suspending the threads that are working
with the DB and executing read transactions, or are ready to do so. But
in the MDBX_NOSTICKYTHREADS mode there is no way to track the threads
working with the DB, and suspending all threads is unacceptable for most
applications.
appveyor.yml Normal file
@ -0,0 +1,99 @@
version: 0.8.0.{build}
environment:
matrix:
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
CMAKE_GENERATOR: Visual Studio 16 2019
TOOLSET: 142
MDBX_BUILD_SHARED_LIBRARY: OFF
MDBX_AVOID_CRT: OFF
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
CMAKE_GENERATOR: Visual Studio 16 2019
TOOLSET: 142
MDBX_BUILD_SHARED_LIBRARY: ON
MDBX_AVOID_CRT: ON
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
CMAKE_GENERATOR: Visual Studio 16 2019
TOOLSET: 142
MDBX_BUILD_SHARED_LIBRARY: OFF
MDBX_AVOID_CRT: ON
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
CMAKE_GENERATOR: Visual Studio 16 2019
TOOLSET: 142
MDBX_BUILD_SHARED_LIBRARY: ON
MDBX_AVOID_CRT: OFF
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
CMAKE_GENERATOR: Visual Studio 15 2017
TOOLSET: 141
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
CMAKE_GENERATOR: Visual Studio 14 2015
TOOLSET: 140
branches:
except:
- coverity_scan
configuration:
- Debug
- Release
platform:
- Win32
- x64
before_build:
- git clean -x -f -d
- git submodule sync
- git fetch --tags --prune
- git submodule update --init --recursive
- git submodule foreach --recursive git fetch --tags --prune
- cmake --version
build_script:
- ps: |
Write-Output "*******************************************************************************"
Write-Output "Configuration: $env:CONFIGURATION"
Write-Output "Platform: $env:PLATFORM"
Write-Output "Toolchain: $env:CMAKE_GENERATOR v$env:TOOLSET"
Write-Output "Options: MDBX_AVOID_CRT=$env:MDBX_AVOID_CRT MDBX_BUILD_SHARED_LIBRARY=$env:MDBX_BUILD_SHARED_LIBRARY"
Write-Output "*******************************************************************************"
md _build -Force | Out-Null
cd _build
$generator = $env:CMAKE_GENERATOR
if ($env:TOOLSET -lt 142) {
if ($env:PLATFORM -eq "x64") {
$generator = "$generator Win64"
}
& cmake -G "$generator" -D CMAKE_CONFIGURATION_TYPES="Debug;Release" -D MDBX_AVOID_CRT:BOOL=$env:MDBX_AVOID_CRT -D MDBX_BUILD_SHARED_LIBRARY:BOOL=$env:MDBX_BUILD_SHARED_LIBRARY ..
} else {
& cmake -G "$generator" -A $env:PLATFORM -D CMAKE_CONFIGURATION_TYPES="Debug;Release" -DMDBX_AVOID_CRT:BOOL=$env:MDBX_AVOID_CRT -D MDBX_BUILD_SHARED_LIBRARY:BOOL=$env:MDBX_BUILD_SHARED_LIBRARY ..
}
if ($LastExitCode -ne 0) {
throw "Exec: $ErrorMessage"
}
Write-Output "*******************************************************************************"
& cmake --build . --config $env:CONFIGURATION
if ($LastExitCode -ne 0) {
throw "Exec: $ErrorMessage"
}
Write-Output "*******************************************************************************"
test_script:
- ps: |
if (($env:PLATFORM -ne "ARM") -and ($env:PLATFORM -ne "ARM64")) {
& ./$env:CONFIGURATION/mdbx_test.exe --progress --console=no --pathname=test.db --dont-cleanup-after basic > test.log
Get-Content test.log | Select-Object -last 42
if ($LastExitCode -ne 0) {
throw "Exec: $ErrorMessage"
} else {
& ./$env:CONFIGURATION/mdbx_chk.exe -nvv test.db | Tee-Object -file chk.log | Select-Object -last 42
}
}
on_failure:
- ps: Push-AppveyorArtifact \projects\libmdbx\_build\test.log
- ps: Push-AppveyorArtifact \projects\libmdbx\_build\test.db
- ps: Push-AppveyorArtifact \projects\libmdbx\_build\chk.log
File diff suppressed because it is too large
@ -1,58 +1,45 @@
# Copyright (c) 2012-2025 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> ###############################################
# SPDX-License-Identifier: Apache-2.0

if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)
elseif(CMAKE_VERSION VERSION_LESS 3.12)
cmake_minimum_required(VERSION 3.8.2)
else()
cmake_minimum_required(VERSION 3.12)
endif()

cmake_policy(PUSH)
cmake_policy(VERSION ${CMAKE_MINIMUM_REQUIRED_VERSION})
unset(MEMCHECK_OPTION_NAME)
if(NOT DEFINED ENABLE_MEMCHECK)
if(DEFINED MDBX_USE_VALGRIND)
set(MEMCHECK_OPTION_NAME "MDBX_USE_VALGRIND")
elseif(DEFINED ENABLE_VALGRIND)
set(MEMCHECK_OPTION_NAME "ENABLE_VALGRIND")
else()
set(MEMCHECK_OPTION_NAME "ENABLE_MEMCHECK")
endif()
if(MEMCHECK_OPTION_NAME STREQUAL "ENABLE_MEMCHECK")
option(ENABLE_MEMCHECK "Enable integration with valgrind, a memory analyzing tool" OFF)
elseif(${MEMCHECK_OPTION_NAME})
set(ENABLE_MEMCHECK ON)
else()
set(ENABLE_MEMCHECK OFF)
endif()
endif()
include(CheckLibraryExists)
check_library_exists(gcov __gcov_flush "" HAVE_GCOV)

option(ENABLE_GCOV "Enable integration with gcov, a code coverage program" OFF)

option(ENABLE_GPROF "Enable integration with gprof, a performance analyzing tool" OFF)

option(ENABLE_ASAN "Enable AddressSanitizer, a fast memory error detector based on compiler instrumentation" OFF)

option(ENABLE_UBSAN
"Enable UndefinedBehaviorSanitizer, a fast undefined behavior detector based on compiler instrumentation" OFF)

if(ENABLE_MEMCHECK)
if(CMAKE_CXX_COMPILER_LOADED)
include(CheckIncludeFileCXX)
check_include_file_cxx(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H)
else()
include(CheckIncludeFile)
check_include_file(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H)
endif()
if(NOT HAVE_VALGRIND_MEMCHECK_H)
message(FATAL_ERROR "${MEMCHECK_OPTION_NAME} option is set but valgrind/memcheck.h is not found")
endif()
endif()

cmake_policy(POP)
@ -1,34 +1,24 @@
# Copyright (c) 2012-2025 Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> ###############################################
# SPDX-License-Identifier: Apache-2.0

if(CMAKE_VERSION VERSION_LESS 3.8.2)
cmake_minimum_required(VERSION 3.0.2)
elseif(CMAKE_VERSION VERSION_LESS 3.12)
cmake_minimum_required(VERSION 3.8.2)
else()
cmake_minimum_required(VERSION 3.12)
endif()

cmake_policy(PUSH)
cmake_policy(VERSION ${CMAKE_MINIMUM_REQUIRED_VERSION})
macro(add_option HIVE NAME DESCRIPTION DEFAULT)
list(APPEND ${HIVE}_BUILD_OPTIONS ${HIVE}_${NAME})
if(NOT ${DEFAULT} STREQUAL "AUTO")
option(${HIVE}_${NAME} "${DESCRIPTION}" ${DEFAULT})
elseif(NOT DEFINED ${HIVE}_${NAME})
set(${HIVE}_${NAME}_AUTO ON)
endif()
endmacro()

macro(set_if_undefined VARNAME)
if(NOT DEFINED "${VARNAME}")
set("${VARNAME}" ${ARGN})
endif()
endmacro()

macro(add_compile_flags languages)
foreach(_lang ${languages})
string(REPLACE ";" " " _flags "${ARGN}")
if(CMAKE_CXX_COMPILER_LOADED AND _lang STREQUAL "CXX")
set("${_lang}_FLAGS" "${${_lang}_FLAGS} ${_flags}")
@ -41,31 +31,15 @@ macro(add_compile_flags languages)
unset(_flags)
endmacro(add_compile_flags)
macro(remove_flag varname flag)
string(REGEX REPLACE "^(.*)( ${flag} )(.*)$" "\\1 \\3" ${varname} ${${varname}})
string(REGEX REPLACE "^((.+ )*)(${flag})(( .+)*)$" "\\1\\4" ${varname} ${${varname}})
endmacro(remove_flag)
macro(remove_compile_flag languages flag)
foreach(_lang ${languages})
if(CMAKE_CXX_COMPILER_LOADED AND _lang STREQUAL "CXX")
remove_flag(${_lang}_FLAGS ${flag})
endif()
if(CMAKE_C_COMPILER_LOADED AND _lang STREQUAL "C")
remove_flag(${_lang}_FLAGS ${flag})
endif()
endforeach()
unset(_lang)
endmacro(remove_compile_flag)
macro(set_source_files_compile_flags)
foreach(file ${ARGN})
get_filename_component(_file_ext ${file} EXT)
set(_lang "")
if("${_file_ext}" STREQUAL ".m")
set(_lang OBJC)
# CMake believes that Objective C is a flavor of C++, not C, and uses g++ compiler for .m files. LANGUAGE property
# forces CMake to use CC for ${file}
set_source_files_properties(${file} PROPERTIES LANGUAGE C)
elseif("${_file_ext}" STREQUAL ".mm")
set(_lang OBJCXX)
@ -79,446 +53,136 @@ macro(set_source_files_compile_flags)
set(_flags "${_flags} ${CMAKE_${_lang}_FLAGS}") set(_flags "${_flags} ${CMAKE_${_lang}_FLAGS}")
endif() endif()
# message(STATUS "Set (${file} ${_flags}") # message(STATUS "Set (${file} ${_flags}")
set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS "${_flags}") set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS
"${_flags}")
endif() endif()
endforeach() endforeach()
unset(_file_ext) unset(_file_ext)
unset(_lang) unset(_lang)
endmacro(set_source_files_compile_flags) endmacro(set_source_files_compile_flags)
macro(semver_parse str)
set(_semver_ok FALSE)
set(_semver_err "")
set(_semver_major 0)
set(_semver_minor 0)
set(_semver_patch 0)
set(_semver_tweak_withdot "")
set(_semver_tweak "")
set(_semver_extra "")
set(_semver_prerelease_withdash "")
set(_semver_prerelease "")
set(_semver_buildmetadata_withplus "")
set(_semver_buildmetadata "")
if("${str}" MATCHES
"^v?(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))?([-+]-*[0-9a-zA-Z]+.*)?$")
set(_semver_major ${CMAKE_MATCH_1})
set(_semver_minor ${CMAKE_MATCH_2})
set(_semver_patch ${CMAKE_MATCH_3})
set(_semver_tweak_withdot ${CMAKE_MATCH_4})
set(_semver_tweak ${CMAKE_MATCH_5})
set(_semver_extra "${CMAKE_MATCH_6}")
if("${_semver_extra}" STREQUAL "")
set(_semver_ok TRUE)
elseif("${_semver_extra}" MATCHES "^([.-][a-zA-Z0-9-]+)*(\\+[^+]+)?$")
set(_semver_prerelease_withdash "${CMAKE_MATCH_1}")
if(NOT "${_semver_prerelease_withdash}" STREQUAL "")
string(SUBSTRING "${_semver_prerelease_withdash}" 1 -1 _semver_prerelease)
endif()
set(_semver_buildmetadata_withplus "${CMAKE_MATCH_2}")
if(NOT "${_semver_buildmetadata_withplus}" STREQUAL "")
string(SUBSTRING "${_semver_buildmetadata_withplus}" 1 -1 _semver_buildmetadata)
endif()
set(_semver_ok TRUE)
else()
set(_semver_err
"Поля prerelease и/или buildmetadata (строка `-foo+bar` в составе `0.0.0[.0][-foo][+bar]`) не соответствуют SemVer-спецификации"
)
endif()
else()
set(_semver_err "Версионная отметка в целом не соответствует шаблону `0.0.0[.0][-foo][+bar]` SemVer-спецификации")
endif()
endmacro(semver_parse)
function(_semver_parse_probe str expect)
semver_parse(${str})
if(expect AND NOT _semver_ok)
message(FATAL_ERROR "semver_parse(${str}) expect SUCCESS, got ${_semver_ok}: ${_semver_err}")
elseif(NOT expect AND _semver_ok)
message(FATAL_ERROR "semver_parse(${str}) expect FAIL, got ${_semver_ok}")
endif()
endfunction()
function(semver_parse_selfcheck)
_semver_parse_probe("0.0.4" TRUE)
_semver_parse_probe("v1.2.3" TRUE)
_semver_parse_probe("10.20.30" TRUE)
_semver_parse_probe("10.20.30.42" TRUE)
_semver_parse_probe("1.1.2-prerelease+meta" TRUE)
_semver_parse_probe("1.1.2+meta" TRUE)
_semver_parse_probe("1.1.2+meta-valid" TRUE)
_semver_parse_probe("1.0.0-alpha" TRUE)
_semver_parse_probe("1.0.0-beta" TRUE)
_semver_parse_probe("1.0.0-alpha.beta" TRUE)
_semver_parse_probe("1.0.0-alpha.beta.1" TRUE)
_semver_parse_probe("1.0.0-alpha.1" TRUE)
_semver_parse_probe("1.0.0-alpha0.valid" TRUE)
_semver_parse_probe("1.0.0-alpha.0valid" TRUE)
_semver_parse_probe("1.0.0-alpha-a.b-c-somethinglong+build.1-aef.1-its-okay" TRUE)
_semver_parse_probe("1.0.0-rc.1+build.1" TRUE)
_semver_parse_probe("2.0.0-rc.1+build.123" TRUE)
_semver_parse_probe("1.2.3-beta" TRUE)
_semver_parse_probe("10.2.3-DEV-SNAPSHOT" TRUE)
_semver_parse_probe("1.2.3-SNAPSHOT-123" TRUE)
_semver_parse_probe("1.0.0" TRUE)
_semver_parse_probe("2.0.0" TRUE)
_semver_parse_probe("1.1.7" TRUE)
_semver_parse_probe("2.0.0+build.1848" TRUE)
_semver_parse_probe("2.0.1-alpha.1227" TRUE)
_semver_parse_probe("1.0.0-alpha+beta" TRUE)
_semver_parse_probe("1.2.3----RC-SNAPSHOT.12.9.1--.12+788" TRUE)
_semver_parse_probe("1.2.3----R-S.12.9.1--.12+meta" TRUE)
_semver_parse_probe("1.2.3----RC-SNAPSHOT.12.9.1--.12" TRUE)
_semver_parse_probe("1.0.0+0.build.1-rc.10000aaa-kk-0.1" TRUE)
_semver_parse_probe("99999999999999999999999.999999999999999999.99999999999999999" TRUE)
_semver_parse_probe("v1.0.0-0A.is.legal" TRUE)
_semver_parse_probe("1" FALSE)
_semver_parse_probe("1.2" FALSE)
# _semver_parse_probe("1.2.3-0123" FALSE) _semver_parse_probe("1.2.3-0123.0123" FALSE)
_semver_parse_probe("1.1.2+.123" FALSE)
_semver_parse_probe("+invalid" FALSE)
_semver_parse_probe("-invalid" FALSE)
_semver_parse_probe("-invalid+invalid" FALSE)
_semver_parse_probe("-invalid.01" FALSE)
_semver_parse_probe("alpha" FALSE)
_semver_parse_probe("alpha.beta" FALSE)
_semver_parse_probe("alpha.beta.1" FALSE)
_semver_parse_probe("alpha.1" FALSE)
_semver_parse_probe("alpha+beta" FALSE)
_semver_parse_probe("alpha_beta" FALSE)
_semver_parse_probe("alpha." FALSE)
_semver_parse_probe("alpha.." FALSE)
_semver_parse_probe("beta" FALSE)
_semver_parse_probe("1.0.0-alpha_beta" FALSE)
_semver_parse_probe("-alpha." FALSE)
_semver_parse_probe("1.0.0-alpha.." FALSE)
_semver_parse_probe("1.0.0-alpha..1" FALSE)
_semver_parse_probe("1.0.0-alpha...1" FALSE)
_semver_parse_probe("1.0.0-alpha....1" FALSE)
_semver_parse_probe("1.0.0-alpha.....1" FALSE)
_semver_parse_probe("1.0.0-alpha......1" FALSE)
_semver_parse_probe("1.0.0-alpha.......1" FALSE)
_semver_parse_probe("01.1.1" FALSE)
_semver_parse_probe("1.01.1" FALSE)
_semver_parse_probe("1.1.01" FALSE)
_semver_parse_probe("1.2" FALSE)
_semver_parse_probe("1.2.3.DEV" FALSE)
_semver_parse_probe("1.2-SNAPSHOT" FALSE)
_semver_parse_probe("1.2.31.2.3----RC-SNAPSHOT.12.09.1--..12+788" FALSE)
_semver_parse_probe("1.2-RC-SNAPSHOT" FALSE)
_semver_parse_probe("-1.0.3-gamma+b7718" FALSE)
_semver_parse_probe("+justmeta" FALSE)
_semver_parse_probe("9.8.7+meta+meta" FALSE)
_semver_parse_probe("9.8.7-whatever+meta+meta" FALSE)
_semver_parse_probe(
"99999999999999999999999.999999999999999999.99999999999999999----RC-SNAPSHOT.12.09.1--------------------------------..12"
FALSE)
endfunction()
macro(git_get_versioninfo source_root_directory)
set(_git_describe "")
set(_git_timestamp "")
set(_git_tree "")
set(_git_commit "")
set(_git_last_vtag "")
set(_git_trailing_commits 0)
set(_git_is_dirty FALSE)
execute_process(
COMMAND ${GIT} show --no-patch --format=%cI HEAD
OUTPUT_VARIABLE _git_timestamp
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc OR "${_git_timestamp}" STREQUAL "%cI")
execute_process(
COMMAND ${GIT} show --no-patch --format=%ci HEAD
OUTPUT_VARIABLE _git_timestamp
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc) RESULT_VARIABLE rc)
if(_rc OR "${_git_timestamp}" STREQUAL "%ci") if(rc OR "${name}_GIT_DESCRIBE" STREQUAL "")
message(FATAL_ERROR "Please install latest version of git (`show --no-patch --format=%cI HEAD` failed)") message(FATAL_ERROR "Please fetch tags and/or install latest version of git ('describe --tags --long --dirty' failed)")
endif()
endif()
execute_process( execute_process(COMMAND ${GIT} show --no-patch --format=%cI HEAD
COMMAND ${GIT} show --no-patch --format=%T HEAD OUTPUT_VARIABLE ${name}_GIT_TIMESTAMP
OUTPUT_VARIABLE _git_tree
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc OR "${_git_tree}" STREQUAL "")
message(FATAL_ERROR "Please install latest version of git (`show --no-patch --format=%T HEAD` failed)")
endif()
execute_process(
COMMAND ${GIT} show --no-patch --format=%H HEAD
OUTPUT_VARIABLE _git_commit
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc OR "${_git_commit}" STREQUAL "")
message(FATAL_ERROR "Please install latest version of git (`show --no-patch --format=%H HEAD` failed)")
endif()
execute_process(
COMMAND ${GIT} status --untracked-files=no --porcelain
OUTPUT_VARIABLE _git_status
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc)
message(FATAL_ERROR "Please install latest version of git (`status --untracked-files=no --porcelain` failed)")
endif()
if(NOT "${_git_status}" STREQUAL "")
set(_git_commit "DIRTY-${_git_commit}")
set(_git_is_dirty TRUE)
endif()
unset(_git_status)
execute_process(
COMMAND ${GIT} describe --tags --abbrev=0 "--match=v[0-9]*"
OUTPUT_VARIABLE _git_last_vtag
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc OR "${_git_last_vtag}" STREQUAL "")
execute_process(
COMMAND ${GIT} tag
OUTPUT_VARIABLE _git_tags_dump
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc) RESULT_VARIABLE rc)
execute_process( if(rc OR "${name}_GIT_TIMESTAMP" STREQUAL "%cI")
COMMAND ${GIT} rev-list --count --no-merges --remove-empty HEAD execute_process(COMMAND ${GIT} show --no-patch --format=%ci HEAD
OUTPUT_VARIABLE _git_whole_count OUTPUT_VARIABLE ${name}_GIT_TIMESTAMP
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc)
message(
FATAL_ERROR
"Please install latest version of git (`git rev-list --count --no-merges --remove-empty HEAD` failed)")
endif()
if(_git_whole_count GREATER 42 AND "${_git_tags_dump}" STREQUAL "")
message(FATAL_ERROR "Please fetch tags (`describe --tags --abbrev=0 --match=v[0-9]*` failed)")
else()
message(NOTICE "Falling back to version `0.0.0` (have you made an initial release?)")
endif()
set(_git_last_vtag "0.0.0")
set(_git_trailing_commits ${_git_whole_count})
execute_process(
COMMAND ${GIT} describe --tags --dirty --long --always
OUTPUT_VARIABLE _git_describe
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc OR "${_git_describe}" STREQUAL "")
execute_process(
COMMAND ${GIT} describe --tags --all --dirty --long --always
OUTPUT_VARIABLE _git_describe
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc) RESULT_VARIABLE rc)
if(_rc OR "${_git_describe}" STREQUAL "") if(rc OR "${name}_GIT_TIMESTAMP" STREQUAL "%ci")
message(FATAL_ERROR "Please install latest version of git (`describe --tags --all --long` failed)") message(FATAL_ERROR "Please install latest version of git ('show --no-patch --format=%cI HEAD' failed)")
endif()
endif()
else()
execute_process(
COMMAND ${GIT} describe --tags --dirty --long "--match=v[0-9]*"
OUTPUT_VARIABLE _git_describe
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc OR "${_git_describe}" STREQUAL "")
message(FATAL_ERROR "Please install latest version of git (`describe --tags --long --match=v[0-9]*`)")
endif()
execute_process(
COMMAND ${GIT} rev-list --count "${_git_last_vtag}..HEAD"
OUTPUT_VARIABLE _git_trailing_commits
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc)
if(_rc OR "${_git_trailing_commits}" STREQUAL "")
message(FATAL_ERROR "Please install latest version of git (`rev-list --count ${_git_last_vtag}..HEAD` failed)")
endif()
endif()
endmacro(git_get_versioninfo)
macro(semver_provide name source_root_directory build_directory_for_json_output build_metadata parent_scope) execute_process(COMMAND ${GIT} show --no-patch --format=%T HEAD
set(_semver "") OUTPUT_VARIABLE ${name}_GIT_TREE
set(_git_describe "")
set(_git_timestamp "")
set(_git_tree "")
set(_git_commit "")
set(_version_from "")
set(_git_root FALSE)
find_program(GIT git)
if(GIT)
execute_process(
COMMAND ${GIT} rev-parse --show-toplevel
OUTPUT_VARIABLE _git_root
ERROR_VARIABLE _git_root_error
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE _rc) RESULT_VARIABLE rc)
if(_rc OR "${_git_root}" STREQUAL "") if(rc OR "${name}_GIT_TREE" STREQUAL "")
if(EXISTS "${source_root_directory}/.git") message(FATAL_ERROR "Please install latest version of git ('show --no-patch --format=%T HEAD' failed)")
message(ERROR "`git rev-parse --show-toplevel` failed '${_git_root_error}'") endif()
else()
message(VERBOSE "`git rev-parse --show-toplevel` failed '${_git_root_error}'") execute_process(COMMAND ${GIT} show --no-patch --format=%H HEAD
endif() OUTPUT_VARIABLE ${name}_GIT_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE rc)
if(rc OR "${name}_GIT_COMMIT" STREQUAL "")
message(FATAL_ERROR "Please install latest version of git ('show --no-patch --format=%H HEAD' failed)")
endif()
execute_process(COMMAND ${GIT} rev-list --count --no-merges HEAD
OUTPUT_VARIABLE ${name}_GIT_REVISION
OUTPUT_STRIP_TRAILING_WHITESPACE
WORKING_DIRECTORY ${source_root_directory}
RESULT_VARIABLE rc)
if(rc OR "${name}_GIT_REVISION" STREQUAL "")
message(FATAL_ERROR "Please install latest version of git ('rev-list --count --no-merges HEAD' failed)")
endif()
string(REGEX MATCH "^(v)?([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)?" git_version_valid "${${name}_GIT_DESCRIBE}")
if(git_version_valid)
string(REGEX REPLACE "^(v)?([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)?" "\\2;\\3;\\4" ${name}_GIT_VERSION ${${name}_GIT_DESCRIBE})
else() else()
set(_source_root "${source_root_directory}") string(REGEX MATCH "^(v)?([0-9]+)\\.([0-9]+)(.*)?" git_version_valid "${${name}_GIT_DESCRIBE}")
if(NOT CMAKE_VERSION VERSION_LESS 3.19) if(git_version_valid)
file(REAL_PATH "${_git_root}" _git_root) string(REGEX REPLACE "^(v)?([0-9]+)\\.([0-9]+)(.*)?" "\\2;\\3;0" ${name}_GIT_VERSION ${${name}_GIT_DESCRIBE})
file(REAL_PATH "${_source_root}" _source_root) else()
endif() message(AUTHOR_WARNING "Bad ${name} version \"${${name}_GIT_DESCRIBE}\"; falling back to 0.0.0 (have you made an initial release?)")
if(_source_root STREQUAL _git_root AND EXISTS "${_git_root}/VERSION.json") set(${name}_GIT_VERSION "0;0;0")
message(
FATAL_ERROR
"Несколько источников информации о версии, допустим только один из: репозиторий git, либо файл VERSION.json"
)
endif() endif()
endif() endif()
endif() endif()
if(EXISTS "${source_root_directory}/VERSION.json") if(NOT ${name}_GIT_VERSION OR NOT ${name}_GIT_TIMESTAMP OR NOT ${name}_GIT_REVISION)
set(_version_from "${source_root_directory}/VERSION.json") if(GIT AND EXISTS "${source_root_directory}/.git")
message(WARNING "Unable to retrive ${name} version from git.")
endif()
set(${name}_GIT_VERSION "0;0;0;0")
set(${name}_GIT_TIMESTAMP "")
set(${name}_GIT_REVISION 0)
if(CMAKE_VERSION VERSION_LESS 3.19) # Try to get version from VERSION file
message(FATAL_ERROR "Требуется CMake версии >= 3.19 для чтения VERSION.json") set(version_file "${source_root_directory}/VERSION")
endif() if(EXISTS "${version_file}")
file( file(STRINGS "${version_file}" ${name}_VERSION LIMIT_COUNT 1 LIMIT_INPUT 42)
STRINGS "${_version_from}" _versioninfo_json NEWLINE_CONSUME endif()
LIMIT_COUNT 9
LIMIT_INPUT 999 if(NOT ${name}_VERSION)
ENCODING UTF-8) message(WARNING "Unable to retrive ${name} version from \"${version_file}\" file.")
string(JSON _git_describe GET ${_versioninfo_json} git_describe) set(${name}_VERSION_LIST ${${name}_GIT_VERSION})
string(JSON _git_timestamp GET "${_versioninfo_json}" "git_timestamp") string(REPLACE ";" "." ${name}_VERSION "${${name}_GIT_VERSION}")
string(JSON _git_tree GET "${_versioninfo_json}" "git_tree") else()
string(JSON _git_commit GET "${_versioninfo_json}" "git_commit") string(REPLACE "." ";" ${name}_VERSION_LIST ${${name}_VERSION})
string(JSON _semver GET "${_versioninfo_json}" "semver")
unset(_json_object)
if(NOT _semver)
message(FATAL_ERROR "Unable to retrieve ${name} version from \"${_version_from}\" file.")
endif()
semver_parse("${_semver}")
if(NOT _semver_ok)
message(FATAL_ERROR "SemVer `${_semver}` from ${_version_from}: ${_semver_err}")
endif()
elseif(_git_root AND _source_root STREQUAL _git_root)
set(_version_from git)
git_get_versioninfo(${source_root_directory})
semver_parse(${_git_last_vtag})
if(NOT _semver_ok)
message(FATAL_ERROR "Git tag `${_git_last_vtag}`: ${_semver_err}")
endif()
if(_git_trailing_commits GREATER 0 AND "${_semver_tweak}" STREQUAL "")
set(_semver_tweak ${_git_trailing_commits})
endif() endif()
elseif(GIT)
message(
FATAL_ERROR
"Нет источника информации о версии (${source_root_directory}), требуется один из: репозиторий git, либо VERSION.json"
)
else() else()
message(FATAL_ERROR "Требуется git для получения информации о версии") list(APPEND ${name}_GIT_VERSION ${${name}_GIT_REVISION})
set(${name}_VERSION_LIST ${${name}_GIT_VERSION})
string(REPLACE ";" "." ${name}_VERSION "${${name}_GIT_VERSION}")
endif() endif()
if(NOT _git_describe list(GET ${name}_VERSION_LIST 0 "${name}_VERSION_MAJOR")
OR NOT _git_timestamp list(GET ${name}_VERSION_LIST 1 "${name}_VERSION_MINOR")
OR NOT _git_tree list(GET ${name}_VERSION_LIST 2 "${name}_VERSION_RELEASE")
OR NOT _git_commit list(GET ${name}_VERSION_LIST 3 "${name}_VERSION_REVISION")
OR "${_semver_major}" STREQUAL ""
OR "${_semver_minor}" STREQUAL ""
OR "${_semver_patch}" STREQUAL "")
message(ERROR "Unable to retrieve ${name} version from ${_version_from}.")
endif()
set(_semver "${_semver_major}.${_semver_minor}.${_semver_patch}")
if("${_semver_tweak}" STREQUAL "")
set(_semver_tweak 0)
elseif(_semver_tweak GREATER 0)
string(APPEND _semver ".${_semver_tweak}")
endif()
if(NOT "${_semver_prerelease}" STREQUAL "")
string(APPEND _semver "-${_semver_prerelease}")
endif()
if(_git_is_dirty)
string(APPEND _semver "-DIRTY")
endif()
set(_semver_complete "${_semver}")
if(NOT "${build_metadata}" STREQUAL "")
string(APPEND _semver_complete "+${build_metadata}")
endif()
set(${name}_VERSION "${_semver_complete}")
set(${name}_VERSION_PURE "${_semver}")
set(${name}_VERSION_MAJOR ${_semver_major})
set(${name}_VERSION_MINOR ${_semver_minor})
set(${name}_VERSION_PATCH ${_semver_patch})
set(${name}_VERSION_TWEAK ${_semver_tweak})
set(${name}_VERSION_PRERELEASE "${_semver_prerelease}")
set(${name}_GIT_DESCRIBE "${_git_describe}")
set(${name}_GIT_TIMESTAMP "${_git_timestamp}")
set(${name}_GIT_TREE "${_git_tree}")
set(${name}_GIT_COMMIT "${_git_commit}")
if(${parent_scope}) if(${parent_scope})
set(${name}_VERSION set(${name}_VERSION_MAJOR "${${name}_VERSION_MAJOR}" PARENT_SCOPE)
"${_semver_complete}" set(${name}_VERSION_MINOR "${${name}_VERSION_MINOR}" PARENT_SCOPE)
PARENT_SCOPE) set(${name}_VERSION_RELEASE "${${name}_VERSION_RELEASE}" PARENT_SCOPE)
set(${name}_VERSION_PURE set(${name}_VERSION_REVISION "${${name}_VERSION_REVISION}" PARENT_SCOPE)
"${_semver}" set(${name}_VERSION "${${name}_VERSION}" PARENT_SCOPE)
PARENT_SCOPE)
set(${name}_VERSION_MAJOR
${_semver_major}
PARENT_SCOPE)
set(${name}_VERSION_MINOR
${_semver_minor}
PARENT_SCOPE)
set(${name}_VERSION_PATCH
${_semver_patch}
PARENT_SCOPE)
set(${name}_VERSION_TWEAK
"${_semver_tweak}"
PARENT_SCOPE)
set(${name}_VERSION_PRERELEASE
"${_semver_prerelease}"
PARENT_SCOPE)
set(${name}_GIT_DESCRIBE
"${_git_describe}"
PARENT_SCOPE)
set(${name}_GIT_TIMESTAMP
"${_git_timestamp}"
PARENT_SCOPE)
set(${name}_GIT_TREE
"${_git_tree}"
PARENT_SCOPE)
set(${name}_GIT_COMMIT
"${_git_commit}"
PARENT_SCOPE)
endif()
if(_version_from STREQUAL "git") set(${name}_GIT_DESCRIBE "${${name}_GIT_DESCRIBE}" PARENT_SCOPE)
string( set(${name}_GIT_TIMESTAMP "${${name}_GIT_TIMESTAMP}" PARENT_SCOPE)
CONFIGURE set(${name}_GIT_TREE "${${name}_GIT_TREE}" PARENT_SCOPE)
"{ set(${name}_GIT_COMMIT "${${name}_GIT_COMMIT}" PARENT_SCOPE)
\"git_describe\" : \"@_git_describe@\", set(${name}_GIT_REVISION "${${name}_GIT_REVISION}" PARENT_SCOPE)
\"git_timestamp\" : \"@_git_timestamp@\", set(${name}_GIT_VERSION "${${name}_GIT_VERSION}" PARENT_SCOPE)
\"git_tree\" : \"@_git_tree@\",
\"git_commit\" : \"@_git_commit@\",
\"semver\" : \"@_semver@\"\n}"
_versioninfo_json
@ONLY ESCAPE_QUOTES)
file(WRITE "${build_directory_for_json_output}/VERSION.json" "${_versioninfo_json}")
endif() endif()
endmacro(semver_provide) endmacro(fetch_version)
cmake_policy(POP)

View File
@ -1,323 +0,0 @@
import shutil
import json
import os
import re
import subprocess
from conan.tools.files import rm
from conan.tools.scm import Git
from conan.tools.apple import is_apple_os
from conan.tools.cmake import CMakeToolchain, CMake, cmake_layout, CMakeDeps
from conan import ConanFile
required_conan_version = '>=2.7'
def semver_parse(s):
m = re.match('^v?(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\.(?P<patch>0|[1-9]\d*)(\\.(?P<tweak>0|[1-9]\d*))?(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$', s)
return m.groupdict() if m else None
def semver_string(semver):
s = str(semver['major']) + '.' + \
str(semver['minor']) + '.' + str(semver['patch'])
if not semver['tweak'] is None and semver['tweak'] != 0:
s += '.' + str(semver['tweak'])
if not semver['prerelease'] is None and semver['prerelease'] != '':
s += '-' + semver['prerelease']
return s
def semver_string_with_buildmetadata(semver):
s = semver_string(semver)
if not semver['buildmetadata'] is None and semver['buildmetadata'] != '':
s += '+' + semver['buildmetadata']
return s
class libmdbx(ConanFile):
name = 'mdbx'
package_type = 'library'
description = 'One of the fastest embeddable key-value ACID database without WAL. libmdbx surpasses the legendary LMDB in terms of reliability, features and performance.'
license = 'Apache-2.0'
author = 'Leo Yuriev <leo@yuriev.ru>'
homepage = 'https://libmdbx.dqdkfa.ru'
url = 'https://gitflic.ru/project/erthink/libmdbx.git'
topics = ('embedded-database', 'key-value', 'btree', 'LMDB', 'storage-engine',
'data-storage', 'nosql', 'ACID', 'MVCC', 'MDBX')
no_copy_source = True
test_type = 'explicit'
build_policy = 'missing'
revision_mode = 'scm'
languages = 'C', 'C++'
provides = 'libmdbx'
implements = ['auto_shared_fpic']
# upload_policy = 'skip'
# exports_sources = 'LICENSE', 'NOTICE', 'CMakeLists.txt', '*.h', '*.h++', '*.c', '*.c++', 'ntdll.def', 'man1/*', 'cmake/*', 'config.h.in'
settings = 'os', 'compiler', 'build_type', 'arch'
options = {
'mdbx.64bit_atomic': ['Auto', True, False],
'mdbx.64bit_cas': ['Auto', True, False],
'mdbx.apple.speed_insteadof_durability': ['Default', True, False],
'mdbx.avoid_msync': ['Auto', True, False],
'mdbx.build_cxx': ['Default', True, False],
'mdbx.build_tools': ['Default', True, False],
'mdbx.cacheline_size': ['Auto', 16, 32, 64, 128, 256],
'mdbx.disable_validation': ['Default', True, False],
'mdbx.enable_bigfoot': ['Default', True, False],
'mdbx.enable_dbi_lockfree': ['Default', True, False],
'mdbx.enable_dbi_sparse': ['Default', True, False],
'mdbx.enable_pgop_stat': ['Default', True, False],
'mdbx.enable_profgc': ['Default', True, False],
'mdbx.enable_refund': ['Default', True, False],
'mdbx.env_checkpid': ['Default', True, False],
'mdbx.force_assertions': ['Default', True, False],
'mdbx.have_builtin_cpu_supports': ['Auto', True, False],
'mdbx.locking': ['Auto', 'WindowsFileLocking', 'SystemV', 'POSIX1988', 'POSIX2001', 'POSIX2008'],
'mdbx.mmap_incoherent_file_write': ['Auto', True, False],
'mdbx.mmap_needs_jolt': ['Auto', True, False],
'mdbx.trust_rtc': ['Default', True, False],
'mdbx.txn_checkowner': ['Default', True, False],
'mdbx.unaligned_ok': ['Auto', True, False],
'mdbx.use_copyfilerange': ['Auto', True, False],
'mdbx.use_mincore': ['Auto', True, False],
'mdbx.use_ofdlocks': ['Auto', True, False],
'mdbx.use_sendfile': ['Auto', True, False],
'mdbx.without_msvc_crt': ['Default', True, False],
'shared': [True, False],
}
default_options = {
'mdbx.64bit_atomic': 'Auto',
'mdbx.64bit_cas': 'Auto',
'mdbx.apple.speed_insteadof_durability': 'Default',
'mdbx.avoid_msync': 'Auto',
'mdbx.build_cxx': 'Default',
'mdbx.build_tools': 'Default',
'mdbx.cacheline_size': 'Auto',
'mdbx.disable_validation': 'Default',
'mdbx.enable_bigfoot': 'Default',
'mdbx.enable_dbi_lockfree': 'Default',
'mdbx.enable_dbi_sparse': 'Default',
'mdbx.enable_pgop_stat': 'Default',
'mdbx.enable_profgc': 'Default',
'mdbx.enable_refund': 'Default',
'mdbx.env_checkpid': 'Default',
'mdbx.force_assertions': 'Default',
'mdbx.have_builtin_cpu_supports': 'Auto',
'mdbx.locking': 'Auto',
'mdbx.mmap_incoherent_file_write': 'Auto',
'mdbx.mmap_needs_jolt': 'Auto',
'mdbx.trust_rtc': 'Default',
'mdbx.txn_checkowner': 'Default',
'mdbx.unaligned_ok': 'Auto',
'mdbx.use_copyfilerange': 'Auto',
'mdbx.use_mincore': 'Auto',
'mdbx.use_ofdlocks': 'Auto',
'mdbx.use_sendfile': 'Auto',
'mdbx.without_msvc_crt': 'Default',
'shared': True,
}
options_description = {
'mdbx.64bit_atomic': 'Advanced: Assume 64-bit operations are atomic and not split into 32-bit halves. ',
'mdbx.64bit_cas': 'Advanced: Assume 64-bit atomic compare-and-swap operation is available. ',
'mdbx.apple.speed_insteadof_durability': 'Disable using `fcntl(F_FULLFSYNC)` for performance reasons at the cost of durability on power failure. ',
'mdbx.avoid_msync': 'Disable in-memory database updating with consequent flush-to-disk/msync syscall in `MDBX_WRITEMAP` mode. ',
'mdbx.build_cxx': 'Build C++ portion. ',
'mdbx.build_tools': 'Build CLI tools (mdbx_chk/stat/dump/load/copy/drop). ',
'mdbx.cacheline_size': 'Advanced: CPU cache line size for data alignment to avoid cache line false-sharing. ',
'mdbx.disable_validation': 'Disable some checks to reduce overhead, lowering the probability of detecting database corruption to values closer to LMDB. ',
'mdbx.enable_bigfoot': 'Chunking long list of retired pages during huge transactions commit to avoid use sequences of pages. ',
'mdbx.enable_dbi_lockfree': 'Support for deferred releasing and a lockfree path to quickly open DBI handles. ',
'mdbx.enable_dbi_sparse': 'Support for sparse sets of DBI handles to reduce overhead when starting and processing transactions. ',
'mdbx.enable_pgop_stat': 'Gathering statistics for page operations. ',
'mdbx.enable_profgc': 'Profiling of GC search and updates. ',
'mdbx.enable_refund': 'Online database zero-cost auto-compactification during write-transactions. ',
'mdbx.env_checkpid': "Checking PID inside libmdbx's API against reuse database environment after the `fork()`. ",
'mdbx.force_assertions': 'Forces assertion checking even for release builds. ',
'mdbx.have_builtin_cpu_supports': 'Advanced: Assume the compiler and target system has `__builtin_cpu_supports()`. ',
'mdbx.locking': 'Advanced: Choices the locking implementation. ',
'mdbx.mmap_incoherent_file_write': "Advanced: Assume the system doesn't have a unified page cache and/or file write operations are incoherent with memory-mapped files. ",
'mdbx.mmap_needs_jolt': 'Advanced: Assume system needs explicit syscall to sync/flush/write modified mapped memory. ',
'mdbx.trust_rtc': 'Advanced: Does a system have battery-backed Real-Time Clock or just a fake. ',
'mdbx.txn_checkowner': 'Checking transaction owner thread against misuse transactions from other threads. ',
'mdbx.unaligned_ok': 'Advanced: Assume a target CPU and/or the compiler support unaligned access. ',
'mdbx.use_copyfilerange': 'Advanced: Use `copy_file_range()` syscall. ',
'mdbx.use_mincore': "Use Unix' `mincore()` to determine whether database pages are resident in memory. ",
'mdbx.use_ofdlocks': 'Advanced: Use POSIX OFD-locks. ',
'mdbx.use_sendfile': 'Advanced: Use `sendfile()` syscall. ',
'mdbx.without_msvc_crt': 'Avoid dependence from MSVC CRT and use ntdll.dll instead. ',
}
build_metadata = None
def config_options(self):
if self.settings.get_safe('os') != 'Linux':
self.options.rm_safe('mdbx.use_copyfilerange')
self.options.rm_safe('mdbx.use_sendfile')
if self.settings.get_safe('os') == 'Windows':
self.default_options['mdbx.avoid_msync'] = True
self.options.rm_safe('mdbx.env_checkpid')
self.options.rm_safe('mdbx.locking')
self.options.rm_safe('mdbx.mmap_incoherent_file_write')
self.options.rm_safe('mdbx.use_mincore')
self.options.rm_safe('mdbx.use_ofdlocks')
else:
self.options.rm_safe('mdbx.without_msvc_crt')
if is_apple_os(self):
self.options.rm_safe('mdbx.mmap_incoherent_file_write')
else:
self.options.rm_safe('mdbx.apple.speed_insteadof_durability')
def fetch_versioninfo_from_git(self):
git = Git(self, folder=self.recipe_folder)
git_timestamp = git.run('show --no-patch --format=%cI HEAD')
git_tree = git.run('show --no-patch --format=%T HEAD')
git_commit = git.run('show --no-patch --format=%H HEAD')
if git.run('rev-list --tags --count') == 0:
git.run('fetch --tags')
git_last_vtag = git.run('describe --tags --abbrev=0 --match=v[0-9]*')
if git_last_vtag == '':
git_describe = git.run('describe --all --long --always')
git_semver = semver_parse(
'0.0.0.' + git.run('rev-list --count --remove-empty --no-merges HEAD'))
else:
git_describe = git.run('describe --tags --long --match=v[0-9]*')
git_version = '.'.join(
map(str, re.split('[-v.]+', git.run('describe --tags --match=v[0-9]*'))[1:5]))
git_semver = semver_parse(git_last_vtag)
if git_semver['prerelease'] is None or git_semver['prerelease'] == '':
git_since_vtag = git.run(
'rev-list ' + git_last_vtag + '.. --count')
if int(git_since_vtag) > 0:
git_semver['tweak'] = int(git_since_vtag)
else:
git_semver['tweak'] = None
info = {'git_describe': git_describe, 'git_timestamp': git_timestamp,
'git_tree': git_tree, 'git_commit': git_commit, 'semver': semver_string(git_semver)}
return info
def export_sources(self):
subprocess.run(['make', '-C', self.recipe_folder, 'DIST_DIR=' +
self.export_sources_folder, '@dist-checked.tag'], check=True)
rm(self, 'Makefile', self.export_sources_folder)
rm(self, 'GNUmakefile', self.export_sources_folder)
# json.dump(self.fetch_versioninfo_from_git(), open(os.path.join(
# self.export_sources_folder, 'VERSION.json'), 'w', encoding='utf-8'))
def source(self):
version_json_pathname = os.path.join(
self.export_sources_folder, 'VERSION.json')
version_json = json.load(
open(os.path.join(version_json_pathname), encoding='utf-8'))['semver']
if version_json != semver_string(semver_parse(self.version)):
self.output.error('Package/Recipe version "' + self.version +
'" mismatch VERSION.json "' + version_json + '"')
def set_version(self):
if self.build_metadata is None and not self.version is None:
self.build_metadata = self.version
semver = semver_parse(self.build_metadata)
if semver:
self.build_metadata = semver['buildmetadata']
else:
self.build_metadata = re.match(
'^[^0-9a-zA-Z]*([0-9a-zA-Z]+[-.0-9a-zA-Z]*)', self.build_metadata).group(1)
if self.build_metadata is None:
self.build_metadata = ''
version_json_pathname = os.path.join(
self.recipe_folder, 'VERSION.json')
if os.path.exists(version_json_pathname):
self.version = json.load(
open(version_json_pathname, encoding='utf-8'))['semver']
version_from = "'" + version_json_pathname + "'"
else:
self.version = self.fetch_versioninfo_from_git()['semver']
version_from = 'Git'
self.output.verbose('Fetch version from ' +
version_from + ': ' + self.version)
if self.build_metadata != '':
self.version += '+' + self.build_metadata
def layout(self):
cmake_layout(self)
def handle_option(self, tc, name, define=False):
opt = self.options.get_safe(name)
if not opt is None:
value = str(opt).lower()
if value != 'auto' and value != 'default':
name = name.upper().replace('.', '_')
if define:
if value == 'false' or value == 'no' or value == 'off':
tc.preprocessor_definitions[name] = 0
elif value == 'true' or value == 'yes' or value == 'on':
tc.preprocessor_definitions[name] = 1
else:
tc.preprocessor_definitions[name] = int(opt)
self.output.highlight(
name + '=' + str(tc.preprocessor_definitions[name]) + ' (' + str(opt) + ')')
else:
tc.cache_variables[name] = opt
self.output.highlight(
name + '=' + str(tc.cache_variables[name]) + ' (' + str(opt) + ')')
def generate(self):
tc = CMakeToolchain(self)
if self.build_metadata is None:
self.build_metadata = semver_parse(self.version)['buildmetadata']
if not self.build_metadata is None and self.build_metadata != '':
tc.variables['MDBX_BUILD_METADATA'] = self.build_metadata
self.output.highlight('MDBX_BUILD_METADATA is ' +
str(tc.variables['MDBX_BUILD_METADATA']))
self.handle_option(tc, 'mdbx.64bit_atomic', True)
self.handle_option(tc, 'mdbx.64bit_cas', True)
self.handle_option(tc, 'mdbx.apple.speed_insteadof_durability')
self.handle_option(tc, 'mdbx.avoid_msync')
self.handle_option(tc, 'mdbx.build_tools')
self.handle_option(tc, 'mdbx.build_cxx')
self.handle_option(tc, 'mdbx.cacheline_size', True)
self.handle_option(tc, 'mdbx.disable_validation')
self.handle_option(tc, 'mdbx.enable_bigfoot')
self.handle_option(tc, 'mdbx.enable_dbi_lockfree')
self.handle_option(tc, 'mdbx.enable_dbi_sparse')
self.handle_option(tc, 'mdbx.enable_pgop_stat')
self.handle_option(tc, 'mdbx.enable_profgc')
self.handle_option(tc, 'mdbx.enable_refund')
self.handle_option(tc, 'mdbx.env_checkpid')
self.handle_option(tc, 'mdbx.force_assertions')
self.handle_option(tc, 'mdbx.have_builtin_cpu_supports', True)
self.handle_option(tc, 'mdbx.mmap_incoherent_file_write', True)
self.handle_option(tc, 'mdbx.mmap_needs_jolt')
self.handle_option(tc, 'mdbx.trust_rtc')
self.handle_option(tc, 'mdbx.txn_checkowner')
self.handle_option(tc, 'mdbx.unaligned_ok', True)
self.handle_option(tc, 'mdbx.use_copyfilerange', True)
self.handle_option(tc, 'mdbx.use_mincore')
self.handle_option(tc, 'mdbx.use_ofdlocks')
self.handle_option(tc, 'mdbx.use_sendfile', True)
self.handle_option(tc, 'mdbx.without_msvc_crt')
opt = self.options.get_safe('mdbx.locking', 'auto')
if not opt is None:
value = str(opt).lower()
if value != 'auto' and value != 'default':
map = {'windowsfilelocking': -1, 'systemv': 5, 'posix1988': 1988,
'posix2001': 2001, 'posix2008': 2008}
value = map[value]
tc.cache_variables['MDBX_LOCKING'] = value
self.output.highlight('MDBX_LOCKING=' +
str(tc.cache_variables['MDBX_LOCKING']))
tc.generate()
def build(self):
cmake = CMake(self)
cmake.configure()
cmake.build()
def package(self):
cmake = CMake(self)
cmake.install()
def package_info(self):
if self.options.shared:
self.cpp_info.libs = ['mdbx']
else:
self.cpp_info.libs = ['mdbx-static']

File diff suppressed because it is too large.

View File
@ -1,47 +0,0 @@
\page intro Introduction
\section characteristics Characteristics
Preface {#preface}
------------------
> For the most part, this section is a copy of the corresponding text
> from LMDB description, but with some edits reflecting the improvements
> and enhancements that were made in MDBX.
MDBX is a Btree-based database management library modeled loosely on the
BerkeleyDB API, but much simplified. The entire database (aka "environment")
is exposed in a memory map, and all data fetches return data directly from
the mapped memory, so no malloc's or memcpy's occur during data fetches.
As such, the library is extremely simple because it requires no page caching
layer of its own, and it is extremely high performance and memory-efficient.
It is also fully transactional with full ACID semantics, and when the memory
map is read-only, the database integrity cannot be corrupted by stray pointer
writes from application code.
The library is fully thread-aware and supports concurrent read/write access
from multiple processes and threads. Data pages use a copy-on-write strategy
so no active data pages are ever overwritten, which also provides resistance
to corruption and eliminates the need of any special recovery procedures
after a system crash. Writes are fully serialized; only one write transaction
may be active at a time, which guarantees that writers can never deadlock.
The database structure is multi-versioned so readers run with no locks;
writers cannot block readers, and readers don't block writers.
Unlike other well-known database mechanisms which use either write-ahead
transaction logs or append-only data writes, MDBX requires no maintenance
during operation. Both write-ahead loggers and append-only databases require
periodic checkpointing and/or compaction of their log or database files
otherwise they grow without bound. MDBX tracks retired/freed pages within the
database and re-uses them for new write operations, so the database size does
not grow without bound in normal use. It is worth noting that the "next"
version of libmdbx (\ref MithrilDB) will solve this problem.
The memory map can be used as a read-only or read-write map. It is read-only
by default as this provides total immunity to corruption. Using read-write
mode offers much higher write performance, but adds the possibility for stray
application writes thru pointers to silently corrupt the database.
Of course if your application code is known to be bug-free (...) then this is
not an issue.
If this is your first time using a transactional embedded key-value store,
you may find the \ref starting section below to be helpful.

View File
@ -1,248 +0,0 @@
Restrictions & Caveats {#restrictions}
======================
In addition to those listed for some functions.
## Long-lived read transactions {#long-lived-read}
Avoid long-lived read transactions, especially in the scenarios with a
high rate of write transactions. Long-lived read transactions prevents
recycling pages retired/freed by newer write transactions, thus the
database can grow quickly.
Understanding the problem of long-lived read transactions requires some
explanation, which can be difficult to grasp quickly. So it is
reasonable to simplify it as follows:
1. Garbage collection problem exists in all databases one way or
another, e.g. VACUUM in PostgreSQL. But in MDBX it's even more
discernible because of high transaction rate and intentional
internals simplification in favor of performance.
2. MDBX employs [Multiversion concurrency control](https://en.wikipedia.org/wiki/Multiversion_concurrency_control)
on the [Copy-on-Write](https://en.wikipedia.org/wiki/Copy-on-write)
basis, which allows multiple readers to run in parallel with a write
transaction without blocking. Each write transaction needs free
pages to store the changed data; those pages will be placed in the new
b-tree snapshot at commit. MDBX efficiently recycles pages from
previously created, no-longer-used snapshots, BUT this is impossible if
any read transaction still uses such a snapshot.
3. Thus, massive altering of data during a parallel long-lived read
operation will increase the process's working set and may exhaust the
entire free database space.
A good example of long readers is a hot backup to a slow destination,
or debugging a client application while an active read transaction is
retained. In LMDB this results in an `MDB_MAP_FULL` error and subsequent
write performance degradation.
MDBX mostly solves the "long-lived" readers issue by offering a
transaction parking-and-ousting approach via \ref mdbx_txn_park(), the
Handle-Slow-Readers \ref MDBX_hsr_func callback which allows aborting
long-lived read transactions, and the \ref MDBX_LIFORECLAIM mode
which addresses the subsequent performance degradation. The "next" version
of libmdbx (aka \ref MithrilDB) will solve this completely.
Nonetheless, situations that encourage lengthy read transactions while
intensively updating data should be avoided. For example, you should
avoid suspending/blocking processes/threads performing read
transactions, including during debugging, and use transaction parking if
necessary.
You should also beware of aborting processes that perform read
transactions. Although libmdbx automatically checks and
cleans up readers, aborting a process (especially with a core dump) can
take a long time, and checking readers cannot be performed too often due
to the performance cost.
This issue will be addressed in MithrilDB and in one of the libmdbx releases,
presumably in 2025. To do this, nonlinear GC recycling will be
implemented, without stopping garbage recycling on the old MVCC snapshot
used by a long read transaction.
After the planned implementation, any long-term reading transaction will
still keep the used MVCC-snapshot (all the database pages forming it)
from being recycled, but it will allow all unused MVCC snapshots to be
recycled, both before and after the readable one. This will eliminate
one of the main architectural flaws inherited from LMDB, which caused the
database to grow in proportion to the volume of data changes made
concurrently with a long-running read transaction.
## Large data items
MDBX allows you to store values up to 1 gigabyte in size, but this is
not the main functionality of a key-value storage, rather an additional
feature that should not be abused. Such long values are stored in
consecutive/adjacent DB pages, which has both pros and cons. This allows
long values to be read directly, without copying and without any
overhead, from a linear section of memory.
On the other hand, when putting such values in the database, it is
required to find a sufficient number of free consecutive/adjacent
database pages, which can be very difficult and expensive, and moreover
sometimes impossible since the b-tree tends toward fragmentation. So, when
placing very long values, the engine may need to process the entire GC,
and in the absence of a sufficient sequence of free pages, increase the
DB file. Thus, for long values, MDBX provides maximum read performance
at the expense of write performance.
Some aspects related to GC have been refined and improved in 2022 within
the first releases of the 0.12.x series. In particular the search for
free consecutive/adjacent pages through GC has been significantly
sped up, including acceleration using NEON/SSE2/AVX2/AVX512
instructions.
This issue will be addressed in MithrilDB and refined within one of
0.15.x libmdbx releases, presumably at end of 2025.
### Huge transactions
A similar situation can be with huge transactions, in which a lot of
database pages are retired. The retired pages should be put into GC as a
list of page numbers for future reuse. But in huge transactions, such a
list of retired page numbers can also be huge, i.e. it is a very long
value and requires a long sequence of free pages to be saved. Thus, if
you delete large amounts of information from the database in a single
transaction, MDBX may need to increase the database file to save the
list of pages to be retired.
This issue was fixed in 2022 within the first releases of the 0.12.x
series by the `Big Foot` feature, which is now enabled by default.
See \ref MDBX_ENABLE_BIGFOOT build-time option.
The `Big Foot` feature significantly reduces GC overhead for
processing large lists of retired pages from huge transactions. Now
libmdbx avoids creating large chunks of PNLs (page number lists) which
would require long sequences of free pages, aka large/overflow pages, thus
avoiding searching, allocating and storing such sequences inside the GC.
## Space reservation
An MDBX database configuration will often reserve considerable unused
memory address space and maybe file size for future growth. This does
not use actual memory or disk space, but users may need to understand
the difference so they won't be scared off.
However, on 64-bit systems with a relatively small amount of RAM, such
reservation can deplete system resources (trigger ENOMEM error, etc)
when setting an inadequately large upper DB size using \ref
mdbx_env_set_geometry() or \ref mdbx::env::geometry. So just avoid this.
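For example, a minimal sketch of bounding the geometry (all numbers below are placeholders for a 64-bit build; `-1` keeps a parameter at its current or default value):

```c
#include "mdbx.h"
#include <stdint.h>

/* May be called before or after mdbx_env_open(); the limits here are
 * examples only, not recommendations. */
static int cap_db_size(MDBX_env *env) {
  return mdbx_env_set_geometry(env,
                               -1,                 /* size_lower */
                               -1,                 /* size_now */
                               (intptr_t)4 << 30,  /* size_upper: 4 GiB */
                               (intptr_t)64 << 20, /* growth_step: 64 MiB */
                               -1,                 /* shrink_threshold */
                               -1);                /* pagesize */
}
```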
## Remote filesystems
Do not use MDBX databases on remote filesystems, even between processes
on the same host. This breaks file locks on some platforms, possibly
memory map sync, and certainly sync between programs on different hosts.
On the other hand, MDBX supports exclusive database operation over
a network, and cooperative read-only access to a database placed on
read-only network shares.
## Child processes
Do not use opened \ref MDBX_env instance(s) in child processes after `fork()`.
It would be insane to call fork() and any MDBX-functions simultaneously
from multiple threads. The best way is to prevent the presence of open
MDBX-instances during `fork()`.
The \ref MDBX_ENV_CHECKPID build-time option, which is ON by default on
non-Windows platforms (i.e. where `fork()` is available), enables PID
checking at a few critical points. However, this does not give any guarantees;
it only allows such errors to be detected a little sooner. Depending on
the platform, you should expect an application crash and/or database
corruption in such cases.
On the other hand, MDBX allows calling \ref mdbx_env_close() in such cases to
release resources, but nothing more; in general this is the wrong way.
#### Since v0.13.1 and later
Starting from the v0.13.1 release, the \ref mdbx_env_resurrect_after_fork()
is available, which allows you to reuse an already open database
environment in child processes, but strictly without inheriting any
transactions from a parent process.
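A minimal sketch of the intended usage is shown below; it assumes the single-argument form `mdbx_env_resurrect_after_fork(MDBX_env *)` returning an MDBX error code, as documented for v0.13.1+ (check the mdbx.h of your version for the exact prototype):

```c
#include "mdbx.h"
#include <unistd.h>

/* The child inherits no transactions; it must re-attach the already
 * open environment before touching it, or close it and reopen. */
static pid_t fork_with_env(MDBX_env *env) {
  pid_t pid = fork();
  if (pid == 0) {
    int rc = mdbx_env_resurrect_after_fork(env);
    if (rc != MDBX_SUCCESS) {
      mdbx_env_close(env);
      _exit(1);
    }
    /* the child may now begin its own transactions */
  }
  return pid;
}
```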
## Read-only mode
There is no pure read-only mode in the normal explicit way, since
readers need write access to the LCK-file to be visible to the writer.
So MDBX always tries to open/create the LCK-file for read-write, but switches
to a without-LCK mode on the appropriate errors (`EROFS`, `EACCES`, `EPERM`)
if read-only mode was requested by the \ref MDBX_RDONLY flag, which is
described below.
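A minimal sketch of a read-only open (the path is a placeholder; error handling is reduced to propagating the return code):

```c
#include "mdbx.h"

static int open_readonly(const char *path, MDBX_env **env, MDBX_txn **txn) {
  int rc = mdbx_env_create(env);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_env_open(*env, path, MDBX_RDONLY | MDBX_NOSUBDIR, 0);
  if (rc == MDBX_SUCCESS)
    rc = mdbx_txn_begin(*env, NULL, MDBX_TXN_RDONLY, txn);
  if (rc != MDBX_SUCCESS) {
    mdbx_env_close(*env);
    *env = NULL;
  }
  return rc;
}
```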
## Troubleshooting the LCK-file
1. A broken LCK-file can cause sync issues, including the appearance of
wrong/inconsistent data for readers. When a database is opened in the
cooperative read-write mode, the LCK-file must be mapped into
memory with read-write access. In this case it is always possible for a
stray/malfunctioning application to write through pointers and
silently corrupt the LCK-file.
Unfortunately, there is no portable way to prevent such
corruption, since the LCK-file is updated concurrently by
multiple processes in a lock-free manner and any locking is
unwise due to a large overhead.
\note Workaround: Just make all programs using the database close it;
the LCK-file is always reset on first open.
2. Stale reader transactions left behind by an aborted program cause
further writes to grow the database quickly, and stale locks can
block further operation.
MDBX checks for stale readers while opening the environment and before
growing the database. But in some cases, this may not be enough.
\note Workaround: Check for stale readers periodically, using the
\ref mdbx_reader_check() function or the mdbx_stat tool; see the sketch
after this list.
3. Stale writers will be cleared automatically by MDBX on supported
platforms. But this is platform-specific, depending in particular on the
implementation of shared POSIX mutexes and support for robust
mutexes. For instance, there are no known issues on Linux, OSX,
Windows and FreeBSD.
\note Workaround: Otherwise just make all programs using the database
close it; the LCK-file is always reset on first open of the environment.
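As mentioned in item 2 above, stale readers can also be cleared explicitly. A sketch of such a periodic check, assuming the two-argument form `mdbx_reader_check(MDBX_env *, int *)` where the second argument receives the number of cleared stale slots:

```c
#include "mdbx.h"
#include <stdio.h>

/* Intended to run from a maintenance thread or timer of the application. */
static void sweep_stale_readers(MDBX_env *env) {
  int dead = 0;
  int rc = mdbx_reader_check(env, &dead);
  if (rc != MDBX_SUCCESS)
    fprintf(stderr, "mdbx_reader_check: %s\n", mdbx_strerror(rc));
  else if (dead > 0)
    fprintf(stderr, "cleared %d stale reader slot(s)\n", dead);
}
```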
## One thread - One transaction
A thread can only use one transaction at a time, plus any nested
read-write transactions in the non-writemap mode. Each transaction
belongs to one thread. The \ref MDBX_NOSTICKYTHREADS flag changes this,
see below.
Do not start more than one transaction for one thread. If you think
about this, it's really strange to do something with two data snapshots
at once, which may be different. MDBX checks for and prevents this by
returning the corresponding error code (\ref MDBX_TXN_OVERLAPPING,
\ref MDBX_BAD_RSLOT, \ref MDBX_BUSY) unless you are using the
\ref MDBX_NOSTICKYTHREADS option on the environment.
Nonetheless, with the `MDBX_NOSTICKYTHREADS` option, you must know
exactly what you are doing, otherwise you will get deadlocks or read
alien data.
## Do not open twice
Do not open an MDBX database twice in the same process at the same
time. By default MDBX prevents this in most cases by tracking database
opening and returning \ref MDBX_BUSY if the LCK-file is already open.
The reason for this is that when the "Open file description" locks (aka
OFD-locks) are not available, MDBX uses POSIX locks on files, and these
locks have issues if one process opens a file multiple times. If a single
process opens the same environment multiple times, closing it once will
remove all the locks held on it, and the other instances will be
vulnerable to corruption from other processes.
For compatibility with LMDB which allows multi-opening, MDBX can be
configured at runtime by `mdbx_setup_debug(MDBX_DBG_LEGACY_MULTIOPEN, ...)`
prior to calling other MDBX functions. In this way MDBX will track
databases opening, detect multi-opening cases and then recover POSIX file
locks as necessary. However, lock recovery can cause unexpected pauses,
such as when another process opened the database in exclusive mode before
the lock was restored - we have to wait until such a process releases the
database, and so on.

View File
@ -1,245 +0,0 @@
Getting started {#starting}
===============
> This section is based on Bert Hubert's intro "LMDB Semantics", with
> edits reflecting the improvements and enhancements that were made in MDBX.
> See Bert Hubert's [original](https://github.com/ahupowerdns/ahutils/blob/master/lmdb-semantics.md).
Everything starts with an environment, created by \ref mdbx_env_create().
Once created, this environment must also be opened with \ref mdbx_env_open(),
and after use be closed by \ref mdbx_env_close(). A non-zero value of the
last argument "mode" means that MDBX will create the database and directory
if they do not exist. In this case the non-zero "mode" argument specifies
the file mode bits to be applied when new files are created by the `open()` function.
Within that directory, a lock file (aka LCK-file) and a storage file (aka
DXB-file) will be generated. If you don't want to use a directory, you can
pass the \ref MDBX_NOSUBDIR option, in which case the path you provided is used
directly as the DXB-file, and another file with a "-lck" suffix added
will be used for the LCK-file.
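For illustration, a minimal sketch of this sequence (the `./example.db` path and the `0664` mode bits are placeholders; error handling is reduced to printing the failure):

```c
#include "mdbx.h"
#include <stdio.h>

int main(void) {
  MDBX_env *env = NULL;
  int rc = mdbx_env_create(&env);
  if (rc != MDBX_SUCCESS) {
    fprintf(stderr, "mdbx_env_create: %s\n", mdbx_strerror(rc));
    return 1;
  }
  /* MDBX_NOSUBDIR: use "./example.db" itself as the DXB-file; the
   * LCK-file will be created alongside as "./example.db-lck". */
  rc = mdbx_env_open(env, "./example.db", MDBX_NOSUBDIR, 0664);
  if (rc != MDBX_SUCCESS)
    fprintf(stderr, "mdbx_env_open: %s\n", mdbx_strerror(rc));
  /* ... transactions go here ... */
  mdbx_env_close(env);
  return rc == MDBX_SUCCESS ? 0 : 1;
}
```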
Once the environment is open, a transaction can be created within it using
\ref mdbx_txn_begin(). Transactions may be read-write or read-only, and read-write
transactions may be nested. A transaction must only be used by one thread at
a time. Transactions are always required, even for read-only access. The
transaction provides a consistent view of the data.
Once a transaction has been created, a database (i.e. key-value space inside
the environment) can be opened within it using \ref mdbx_dbi_open(). If only one
database will ever be used in the environment, a `NULL` can be passed as the
database name. For named databases, the \ref MDBX_CREATE flag must be used to
create the database if it doesn't already exist. Also, \ref mdbx_env_set_maxdbs()
must be called after \ref mdbx_env_create() and before \ref mdbx_env_open() to set
the maximum number of named databases you want to support.
\note A single transaction can open multiple databases. Generally databases
should only be opened once, by the first transaction in the process.
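A minimal sketch of opening a named database (the name `widgets` is a placeholder; it assumes \ref mdbx_env_set_maxdbs() was called before \ref mdbx_env_open()):

```c
#include "mdbx.h"

/* Open (creating if absent) a named key-value space inside the environment. */
static int open_widgets_dbi(MDBX_env *env, MDBX_dbi *dbi) {
  MDBX_txn *txn = NULL;
  int rc = mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &txn);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_dbi_open(txn, "widgets", MDBX_CREATE, dbi);
  if (rc != MDBX_SUCCESS) {
    mdbx_txn_abort(txn);
    return rc;
  }
  /* Committing makes the handle available to other transactions. */
  return mdbx_txn_commit(txn);
}
```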
Within a transaction, \ref mdbx_get() and \ref mdbx_put() can store single key-value
pairs if that is all you need to do (but see \ref Cursors below if you want to do
more).
A key-value pair is expressed as two \ref MDBX_val structures. This struct is
identical in layout to POSIX's `struct iovec` and has two fields, `iov_len` and
`iov_base`. The data is a `void` pointer to an array of `iov_len` bytes.
\note A notable difference between MDBX and LMDB is that MDBX supports
zero-length keys.
Because MDBX is very efficient (and usually zero-copy), the data returned in
an \ref MDBX_val structure may be memory-mapped straight from disk. In other words
look but do not touch (or `free()` for that matter). Once a transaction is
closed, the values can no longer be used, so make a copy if you need to keep
them after that.
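A sketch of a single put and get, assuming `dbi` was opened as above; designated initializers keep the `iov_base`/`iov_len` order explicit, and the fetched value is only valid while the transaction lives:

```c
#include "mdbx.h"
#include <string.h>

static int put_then_get(MDBX_env *env, MDBX_dbi dbi) {
  MDBX_txn *txn = NULL;
  int rc = mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &txn);
  if (rc != MDBX_SUCCESS)
    return rc;

  MDBX_val key = {.iov_base = (void *)"greeting", .iov_len = strlen("greeting")};
  MDBX_val data = {.iov_base = (void *)"hello", .iov_len = strlen("hello")};
  rc = mdbx_put(txn, dbi, &key, &data, MDBX_UPSERT);
  if (rc == MDBX_SUCCESS) {
    MDBX_val fetched;
    rc = mdbx_get(txn, dbi, &key, &fetched);
    /* fetched.iov_base points into the memory map: look but do not
     * touch, and copy it out if needed beyond this transaction. */
  }
  if (rc != MDBX_SUCCESS) {
    mdbx_txn_abort(txn);
    return rc;
  }
  return mdbx_txn_commit(txn);
}
```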
## Cursors {#Cursors}
To do more powerful things, we must use a cursor.
Within the transaction, a cursor can be created with \ref mdbx_cursor_open().
With this cursor we can store/retrieve/delete (multiple) values using
\ref mdbx_cursor_get(), \ref mdbx_cursor_put() and \ref mdbx_cursor_del().
The \ref mdbx_cursor_get() positions itself depending on the cursor operation
requested, and for some operations, on the supplied key. For example, to list
all key-value pairs in a database, use operation \ref MDBX_FIRST for the first
call to \ref mdbx_cursor_get(), and \ref MDBX_NEXT on subsequent calls, until
the end is hit.
To retrieve all keys starting from a specified key value, use \ref MDBX_SET. For
more cursor operations, see the \ref c_api reference.
When using \ref mdbx_cursor_put(), either the function will position the cursor
for you based on the key, or you can use operation \ref MDBX_CURRENT to use the
current position of the cursor. \note Note that the key must then match the
current position's key.
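A sketch of a full scan with a cursor, stepping from \ref MDBX_FIRST through \ref MDBX_NEXT until \ref MDBX_NOTFOUND (keys and values are printed as raw bytes purely for illustration):

```c
#include "mdbx.h"
#include <stdio.h>

static int dump_all(MDBX_txn *txn, MDBX_dbi dbi) {
  MDBX_cursor *cursor = NULL;
  int rc = mdbx_cursor_open(txn, dbi, &cursor);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_val key, data;
  rc = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST);
  while (rc == MDBX_SUCCESS) {
    printf("%.*s => %.*s\n", (int)key.iov_len, (const char *)key.iov_base,
           (int)data.iov_len, (const char *)data.iov_base);
    rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT);
  }
  mdbx_cursor_close(cursor); /* cursors are always freed explicitly */
  return rc == MDBX_NOTFOUND ? MDBX_SUCCESS : rc;
}
```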
## Summarizing the opening
So we have a cursor in a transaction which opened a database in an
environment which is opened from a filesystem after it was separately
created.
Or, we create an environment, open it from a filesystem, create a transaction
within it, open a database within that transaction, and create a cursor
within all of the above.
Got it?
## Threads and processes
Do not open a database twice in the same process at the same time; MDBX
will track and prevent this. Instead, share the MDBX environment that has
opened the file across all threads. The reason for this is:
- When the "Open file description" locks (aka OFD-locks) are not available,
MDBX uses POSIX locks on files, and these locks have issues if one process
opens a file multiple times.
- If a single process opens the same environment multiple times, closing it
once will remove all the locks held on it, and the other instances will be
vulnerable to corruption from other processes.
+ For compatibility with LMDB which allows multi-opening, MDBX can be
configured at runtime by \ref mdbx_setup_debug() with the \ref MDBX_DBG_LEGACY_MULTIOPEN option
prior to calling other MDBX functions. In this way MDBX will track
databases opening, detect multi-opening cases and then recover POSIX file
locks as necessary. However, lock recovery can cause unexpected pauses,
such as when another process opened the database in exclusive mode before
the lock was restored - we have to wait until such a process releases the
database, and so on.
Do not use opened MDBX environment(s) after `fork()` in a child process(es),
MDBX will check and prevent this at critical points. Instead, ensure there is
no open MDBX-instance(s) during fork(), or at least close it immediately after
`fork()` in the child process and reopen if required - for instance by using
`pthread_atfork()`. The reason for this is:
- For competitive consistent reading, MDBX assigns a slot in the shared
table for each process that interacts with the database. This slot is
populated with process attributes, including the PID.
- After `fork()`, in order to remain connected to a database, the child
process must have its own such "slot", which can't be assigned in any
simple and robust way other than the regular one.
- A write transaction from a parent process cannot continue in a child
process for obvious reasons.
- Moreover, in a multithreaded process at the fork() moment any number of
threads could run in critical and/or intermediate sections of MDBX code
with interaction and/or racing conditions with threads from other
process(es). For instance: shrinking a database or copying it to a pipe,
opening or closing environment, beginning or finishing a transaction,
and so on.
= Therefore, any solution other than simply closing the database (and
reopening it if necessary) in a child process would be both extremely
complicated and fragile.
Do not start more than one transaction for one thread. If you think
about this, it's really strange to do something with two data snapshots
at once, which may be different. MDBX checks for and prevents this by
returning the corresponding error code (\ref MDBX_TXN_OVERLAPPING,
\ref MDBX_BAD_RSLOT, \ref MDBX_BUSY) unless you are using the
\ref MDBX_NOSTICKYTHREADS option on the environment. Nonetheless,
with the \ref MDBX_NOSTICKYTHREADS option, you must know exactly what
you are doing, otherwise you will get deadlocks or read alien
data.
Also note that a transaction is tied to one thread by default using
Thread Local Storage. If you want to pass transactions across threads,
you can use the \ref MDBX_NOSTICKYTHREADS option on the environment.
Nevertheless, a write transaction must be committed or aborted in the
same thread in which it was started. MDBX checks this in a reasonable
manner and returns the \ref MDBX_THREAD_MISMATCH error on rule
violations.
## Transactions, rollbacks etc
To actually get anything done, a transaction must be committed using
\ref mdbx_txn_commit(). Alternatively, all of a transaction's operations
can be discarded using \ref mdbx_txn_abort().
\attention An important difference between MDBX and LMDB is that in MDBX
any opened cursor can be reused and must be freed explicitly, regardless
of whether it was opened in a read-only or a write transaction. The REASON
for this is that it eliminates ambiguity, which helps to avoid errors such
as use-after-free and double-free, i.e. memory corruption and segfaults.
For read-only transactions, obviously there is nothing to commit to storage.
\attention Another notable difference between MDBX and LMDB is that MDBX
makes handles opened for existing databases immediately available to other
transactions, regardless of whether the opening transaction will be aborted
or reset. The REASON for this is to avoid requiring the same handles to be
opened multiple times in concurrent read transactions, and to avoid tracking
such open-but-hidden handles until the read transactions which opened them complete.
In addition, as long as a transaction is open, a consistent view of the
database is kept alive, which requires storage. A read-only transaction that
no longer requires this consistent view should be terminated (committed or
aborted) when the view is no longer needed (but see below for an
optimization).
There can be multiple simultaneously active read-only transactions but only
one that can write. Once a single read-write transaction is opened, all
further attempts to begin one will block until the first one is committed or
aborted. This has no effect on read-only transactions, however, and they may
continue to be opened at any time.
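The usual shape of a write transaction is therefore commit-on-success, abort-on-any-error. A sketch with a made-up `counter` record (error handling kept minimal):

```c
#include "mdbx.h"
#include <stdint.h>
#include <string.h>

/* Either the increment becomes a new durable snapshot, or nothing changes. */
static int bump_counter(MDBX_env *env, MDBX_dbi dbi) {
  MDBX_txn *txn = NULL;
  int rc = mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &txn);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_val key = {.iov_base = (void *)"counter", .iov_len = strlen("counter")};
  MDBX_val old;
  uint64_t value = 0;
  rc = mdbx_get(txn, dbi, &key, &old);
  if (rc == MDBX_SUCCESS && old.iov_len == sizeof(value))
    memcpy(&value, old.iov_base, sizeof(value));
  if (rc == MDBX_SUCCESS || rc == MDBX_NOTFOUND) {
    value += 1;
    MDBX_val data = {.iov_base = &value, .iov_len = sizeof(value)};
    rc = mdbx_put(txn, dbi, &key, &data, MDBX_UPSERT);
  }
  if (rc != MDBX_SUCCESS) {
    mdbx_txn_abort(txn); /* discard the whole snapshot-in-progress */
    return rc;
  }
  return mdbx_txn_commit(txn);
}
```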
## Duplicate keys aka Multi-values
\ref mdbx_get() and \ref mdbx_put() respectively have no and only limited
support for multiple key-value pairs with identical keys. If there are
multiple values for a key, \ref mdbx_get() will only return the first value.
When multiple values for one key are required, pass the \ref MDBX_DUPSORT flag to
\ref mdbx_dbi_open(). In an \ref MDBX_DUPSORT database, by default \ref mdbx_put() will
not replace the value for a key if the key existed already. Instead it will add
the new value to the key. In addition, \ref mdbx_del() will pay attention to the
value field too, allowing for specific values of a key to be deleted.
Finally, additional cursor operations become available for traversing through
and retrieving duplicate values.
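A sketch of visiting every value stored under one key in an \ref MDBX_DUPSORT database, positioning with \ref MDBX_SET_KEY and stepping with \ref MDBX_NEXT_DUP:

```c
#include "mdbx.h"

/* Assumes `dbi` was opened with MDBX_DUPSORT (plus MDBX_CREATE on first use). */
static int for_each_dup(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val key,
                        void (*visit)(const MDBX_val *value)) {
  MDBX_cursor *cursor = NULL;
  int rc = mdbx_cursor_open(txn, dbi, &cursor);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_val data;
  rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_KEY);
  while (rc == MDBX_SUCCESS) {
    visit(&data);
    rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT_DUP);
  }
  mdbx_cursor_close(cursor);
  return rc == MDBX_NOTFOUND ? MDBX_SUCCESS : rc;
}
```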
## Some optimization
If you frequently begin and abort read-only transactions, as an optimization,
it is possible to only reset and renew a transaction.
\ref mdbx_txn_reset() releases any old copies of data kept around for a read-only
transaction. To reuse this reset transaction, call \ref mdbx_txn_renew() on it.
Any cursors in this transaction can also be renewed using \ref mdbx_cursor_renew()
or freed by \ref mdbx_cursor_close().
To permanently free a transaction, reset or not, use \ref mdbx_txn_abort().
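A sketch of the reset/renew pattern for a cached read-only transaction; it assumes `rtxn` was begun with \ref MDBX_TXN_RDONLY and then reset, so each lookup renews it first and resets it again afterwards:

```c
#include "mdbx.h"
#include <stdio.h>

/* One such cached transaction per reader thread is the typical layout. */
static int lookup_once(MDBX_txn *rtxn, MDBX_dbi dbi, MDBX_val key) {
  int rc = mdbx_txn_renew(rtxn); /* re-attach to the latest snapshot */
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_val data;
  rc = mdbx_get(rtxn, dbi, &key, &data);
  if (rc == MDBX_SUCCESS)
    printf("found %zu bytes\n", data.iov_len); /* copy here if needed later */
  mdbx_txn_reset(rtxn); /* release the snapshot, keep the handle */
  return rc;
}
```

The handle itself is created once with \ref mdbx_txn_begin() and finally freed with \ref mdbx_txn_abort(), as described above.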
## Cleaning up
Any created cursors must be closed using \ref mdbx_cursor_close(). It is advisable
to repeat:
\note An important difference between MDBX and LMDB is that in MDBX any
opened cursor can be reused and must be freed explicitly, regardless of
whether it was opened in a read-only or a write transaction. The REASON for
this is that it eliminates ambiguity, which helps to avoid errors such as
use-after-free and double-free, i.e. memory corruption and segfaults.
It is very rarely necessary to close a database handle, and in general they
should just be left open. When you close a handle, it immediately becomes
unavailable for all transactions in the environment. Therefore, you should
avoid closing the handle while at least one transaction is using it.
## Now read up on the full API!
The full \ref c_api documentation lists further details below, like how to:
- Configure database size and automatic size management: \ref mdbx_env_set_geometry().
- Drop and clean a database: \ref mdbx_drop().
- Detect and report errors: \ref c_err.
- Optimize (bulk) loading speed: \ref MDBX_MULTIPLE, \ref MDBX_APPEND.
- Reduce (temporarily) robustness to gain even more speed: \ref sync_modes.
- Gather statistics about the database: \ref c_statinfo.
- Estimate the size of a range query result: \ref c_rqest.
- Double performance by LIFO reclaiming on storages with write-back: \ref MDBX_LIFORECLAIM.
- Use sequences and canary markers: \ref mdbx_dbi_sequence(), \ref MDBX_canary.
- Use Handle-Slow-Readers callback to resolve a database full/overflow issues
due to long-lived read transactions: \ref mdbx_env_set_hsr().
- Use exclusive mode: \ref MDBX_EXCLUSIVE.
- Define custom sort orders (but this is recommended to be avoided).
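For instance, a hedged sketch of the geometry setup mentioned in the first item of the list above (the concrete sizes are arbitrary placeholders, and such a call is typically made between \ref mdbx_env_create() and \ref mdbx_env_open()):

```c
#include "mdbx.h"

/* Sketch: let the database grow from 1 MiB up to 1 GiB in 16 MiB steps,
 * shrinking once 64 MiB become unused; -1 keeps the engine's defaults. */
int sketch_geometry(MDBX_env *env) {
  return mdbx_env_set_geometry(env,
                               1l << 20,  /* size_lower */
                               -1,        /* size_now: keep current */
                               1l << 30,  /* size_upper */
                               16l << 20, /* growth_step */
                               64l << 20, /* shrink_threshold */
                               -1);       /* pagesize: keep default */
}
```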

View File

@ -1,37 +0,0 @@
The source code is available on [Gitflic](https://gitflic.ru/project/erthink/libmdbx).
Donations are welcome to ETH `0xD104d8f8B2dC312aaD74899F83EBf3EEBDC1EA3A`.
Всё будет хорошо! (Everything will be fine!)
> Questions, feedback and suggestions are welcome to the [Telegram' group](https://t.me/libmdbx) (archive [1](https://libmdbx.dqdkfa.ru/tg-archive/messages1.html),
> [2](https://libmdbx.dqdkfa.ru/tg-archive/messages2.html), [3](https://libmdbx.dqdkfa.ru/tg-archive/messages3.html), [4](https://libmdbx.dqdkfa.ru/tg-archive/messages4.html),
> [5](https://libmdbx.dqdkfa.ru/tg-archive/messages5.html), [6](https://libmdbx.dqdkfa.ru/tg-archive/messages6.html), [7](https://libmdbx.dqdkfa.ru/tg-archive/messages7.html)).
> See the [ChangeLog](https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md) for `NEWS` and latest updates.
\section toc Table of Contents
This manual is divided into parts,
each of which is divided into several sections.
1. The \ref intro
- \ref characteristics
- \ref improvements
- \ref restrictions
- \ref performance
2. \ref usage
- \ref getting
- \ref starting
- \ref bindings
3. The `C/C++` API manual:
- The \ref c_api reference
- \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
- The \ref mdbx.h header file reference
- The \ref cxx_api reference
- The \ref mdbx.h++ header file reference
Please do not hesitate to point out errors in the documentation,
including by creating a [merge-request](https://gitflic.ru/project/erthink/libmdbx/merge-request) with corrections and improvements.
---
\section MithrilDB MithrilDB and Future

Binary file not shown (image, 4.2 KiB)

View File

@ -1,103 +0,0 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="$langISO">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=11"/>
<meta name="generator" content="Doxygen $doxygenversion"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<link rel="icon" href="favicon.ico">
<link rel="icon" href="img/bear.png" type="image/png">
<link rel="apple-touch-icon" href="img/bear.png">
<meta property="og:type" content="article"/>
<meta property="og:url" content="https://libmdbx.dqdkfa.ru/"/>
<meta name="twitter:title" content="One of the fastest embeddable key-value engine"/>
<meta name="twitter:description" content="MDBX surpasses the legendary LMDB in terms of reliability, features and performance. For now libmdbx is chosen by all modern Ethereum frontiers as a storage engine."/>
<!--BEGIN PROJECT_NAME--><title>$projectname: $title</title><!--END PROJECT_NAME-->
<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
<!--BEGIN PROJECT_ICON-->
<link rel="icon" href="$relpath^$projecticon" type="image/x-icon" />
<!--END PROJECT_ICON-->
<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
<!--BEGIN DISABLE_INDEX-->
<!--BEGIN FULL_SIDEBAR-->
<script type="text/javascript">var page_layout=1;</script>
<!--END FULL_SIDEBAR-->
<!--END DISABLE_INDEX-->
<script type="text/javascript" src="$relpath^jquery.js"></script>
<script type="text/javascript" src="$relpath^dynsections.js"></script>
<!--BEGIN COPY_CLIPBOARD-->
<script type="text/javascript" src="$relpath^clipboard.js"></script>
<!--END COPY_CLIPBOARD-->
$treeview
$search
$mathjax
$darkmode
<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
$extrastylesheet
</head>
<body>
<!--BEGIN DISABLE_INDEX-->
<!--BEGIN FULL_SIDEBAR-->
<div id="side-nav" class="ui-resizable side-nav-resizable"><!-- do not remove this div, it is closed by doxygen! -->
<!--END FULL_SIDEBAR-->
<!--END DISABLE_INDEX-->
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<!--BEGIN TITLEAREA-->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr id="projectrow">
<!--BEGIN PROJECT_LOGO-->
<td id="projectlogo"><img alt="Logo" src="$relpath^$projectlogo"$logosize/></td>
<!--END PROJECT_LOGO-->
<!--BEGIN PROJECT_NAME-->
<td id="projectalign">
<div id="projectname">$projectname<!--BEGIN PROJECT_NUMBER--><span id="projectnumber">&#160;$projectnumber</span><!--END PROJECT_NUMBER-->
</div>
<!--BEGIN PROJECT_BRIEF--><div id="projectbrief">$projectbrief</div><!--END PROJECT_BRIEF-->
</td>
<!--END PROJECT_NAME-->
<!--BEGIN !PROJECT_NAME-->
<!--BEGIN PROJECT_BRIEF-->
<td>
<div id="projectbrief">$projectbrief</div>
</td>
<!--END PROJECT_BRIEF-->
<!--END !PROJECT_NAME-->
<!--BEGIN DISABLE_INDEX-->
<!--BEGIN SEARCHENGINE-->
<!--BEGIN !FULL_SIDEBAR-->
<td>$searchbox</td>
<!--END !FULL_SIDEBAR-->
<!--END SEARCHENGINE-->
<!--END DISABLE_INDEX-->
</tr>
<!--BEGIN SEARCHENGINE-->
<!--BEGIN FULL_SIDEBAR-->
<tr><td colspan="2">$searchbox</td></tr>
<!--END FULL_SIDEBAR-->
<!--END SEARCHENGINE-->
</tbody>
</table>
</div>
<!--END TITLEAREA-->
<!-- Yandex.Metrika counter -->
<script type="text/javascript" >
(function(m,e,t,r,i,k,a){m[i]=m[i]||function(){(m[i].a=m[i].a||[]).push(arguments)};
m[i].l=1*new Date();
for (var j = 0; j < document.scripts.length; j++) {if (document.scripts[j].src === r) { return; }}
k=e.createElement(t),a=e.getElementsByTagName(t)[0],k.async=1,k.src=r,a.parentNode.insertBefore(k,a)})
(window, document, "script", "https://mc.yandex.ru/metrika/tag.js", "ym");
ym(99261645, "init", {
clickmap:true,
trackLinks:true,
accurateTrackBounce:true,
webvisor:true
});
</script>
<noscript><div><img src="https://mc.yandex.ru/watch/99261645" style="position:absolute; left:-9999px;" alt="" /></div></noscript>
<!-- /Yandex.Metrika counter -->
<!-- end header part -->

View File

@ -1,27 +0,0 @@
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "ItemList",
"itemListElement": [{
"@type": "ListItem",
"position": 1,
"name": "Группа в Telegram",
"url": "https://t.me/libmdbx"
},{
"@type": "ListItem",
"position": 2,
"name": "Исходный код",
"url": "https://gitflic.ru/project/erthink/libmdbx"
},{
"@type": "ListItem",
"position": 3,
"name": "C++ API",
"url": "https://libmdbx.dqdkfa.ru/group__cxx__api.html"
},{
"@type": "ListItem",
"position": 4,
"name": "Mirror on Github",
"url": "https://github.com/erthink/libmdbx"
}]
}
</script>

View File

@ -1,6 +0,0 @@
{
"icons": [
{ "src": "favicon.ico", "type": "image/ico", "sizes": "32x32" },
{ "src": "img/bear.png", "type": "image/png", "sizes": "256x256" }
]
}

View File

@ -1,2 +0,0 @@
<title>libmdbx: One of the fastest embeddable key-value engine</title>
<meta name="description" content="libmdbx surpasses the legendary LMDB in terms of reliability, features and performance. For now libmdbx is chosen by all modern Ethereum frontiers as a storage engine.">

View File

@ -1,10 +1,10 @@
/* MDBX usage example /* MDBX usage examle
* *
* Do a line-by-line comparison of this and sample-bdb.txt * Do a line-by-line comparison of this and sample-bdb.txt
*/ */
/* /*
* Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>. * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
* Copyright 2017 Ilya Shipitsin <chipitsine@gmail.com>. * Copyright 2017 Ilya Shipitsin <chipitsine@gmail.com>.
* Copyright 2012-2015 Howard Chu, Symas Corp. * Copyright 2012-2015 Howard Chu, Symas Corp.
* All rights reserved. * All rights reserved.
@ -18,14 +18,7 @@
* <http://www.OpenLDAP.org/license.html>. * <http://www.OpenLDAP.org/license.html>.
*/ */
#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && !defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#include "mdbx.h" #include "mdbx.h"
#include <limits.h>
#include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -41,49 +34,13 @@ int main(int argc, char *argv[]) {
MDBX_cursor *cursor = NULL; MDBX_cursor *cursor = NULL;
char sval[32]; char sval[32];
printf("MDBX limits:\n");
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
const double scale_factor = 1099511627776.0;
const char *const scale_unit = "TiB";
#else
const double scale_factor = 1073741824.0;
const char *const scale_unit = "GiB";
#endif
const size_t pagesize_min = mdbx_limits_pgsize_min();
const size_t pagesize_max = mdbx_limits_pgsize_max();
const size_t pagesize_default = mdbx_default_pagesize();
printf("\tPage size: a power of 2, minimum %zu, maximum %zu bytes,"
" default %zu bytes.\n",
pagesize_min, pagesize_max, pagesize_default);
printf("\tKey size: minimum %zu, maximum ≈¼ pagesize (%zu bytes for default"
" %zuK pagesize, %zu bytes for %zuK pagesize).\n",
(size_t)0, mdbx_limits_keysize_max(-1, MDBX_DB_DEFAULTS), pagesize_default / 1024,
mdbx_limits_keysize_max(pagesize_max, MDBX_DB_DEFAULTS), pagesize_max / 1024);
printf("\tValue size: minimum %zu, maximum %zu (0x%08zX) bytes for maps,"
" ≈¼ pagesize for multimaps (%zu bytes for default %zuK pagesize,"
" %zu bytes for %zuK pagesize).\n",
(size_t)0, mdbx_limits_valsize_max(pagesize_min, MDBX_DB_DEFAULTS),
mdbx_limits_valsize_max(pagesize_min, MDBX_DB_DEFAULTS), mdbx_limits_valsize_max(-1, MDBX_DUPSORT),
pagesize_default / 1024, mdbx_limits_valsize_max(pagesize_max, MDBX_DUPSORT), pagesize_max / 1024);
printf("\tWrite transaction size: up to %zu (0x%zX) pages (%f %s for default "
"%zuK pagesize, %f %s for %zuK pagesize).\n",
mdbx_limits_txnsize_max(pagesize_min) / pagesize_min, mdbx_limits_txnsize_max(pagesize_min) / pagesize_min,
mdbx_limits_txnsize_max(-1) / scale_factor, scale_unit, pagesize_default / 1024,
mdbx_limits_txnsize_max(pagesize_max) / scale_factor, scale_unit, pagesize_max / 1024);
printf("\tDatabase size: up to %zu pages (%f %s for default %zuK "
"pagesize, %f %s for %zuK pagesize).\n",
mdbx_limits_dbsize_max(pagesize_min) / pagesize_min, mdbx_limits_dbsize_max(-1) / scale_factor, scale_unit,
pagesize_default / 1024, mdbx_limits_dbsize_max(pagesize_max) / scale_factor, scale_unit, pagesize_max / 1024);
printf("\tMaximum sub-databases: %u.\n", MDBX_MAX_DBI);
printf("-----\n");
rc = mdbx_env_create(&env); rc = mdbx_env_create(&env);
if (rc != MDBX_SUCCESS) { if (rc != MDBX_SUCCESS) {
fprintf(stderr, "mdbx_env_create: (%d) %s\n", rc, mdbx_strerror(rc)); fprintf(stderr, "mdbx_env_create: (%d) %s\n", rc, mdbx_strerror(rc));
goto bailout; goto bailout;
} }
rc = mdbx_env_open(env, "./example-db", MDBX_NOSUBDIR | MDBX_LIFORECLAIM, 0664); rc = mdbx_env_open(env, "./example-db",
MDBX_NOSUBDIR | MDBX_COALESCE | MDBX_LIFORECLAIM, 0664);
if (rc != MDBX_SUCCESS) { if (rc != MDBX_SUCCESS) {
fprintf(stderr, "mdbx_env_open: (%d) %s\n", rc, mdbx_strerror(rc)); fprintf(stderr, "mdbx_env_open: (%d) %s\n", rc, mdbx_strerror(rc));
goto bailout; goto bailout;
@ -118,7 +75,7 @@ int main(int argc, char *argv[]) {
} }
txn = NULL; txn = NULL;
rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn); rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn);
if (rc != MDBX_SUCCESS) { if (rc != MDBX_SUCCESS) {
fprintf(stderr, "mdbx_txn_begin: (%d) %s\n", rc, mdbx_strerror(rc)); fprintf(stderr, "mdbx_txn_begin: (%d) %s\n", rc, mdbx_strerror(rc));
goto bailout; goto bailout;
@ -131,8 +88,9 @@ int main(int argc, char *argv[]) {
int found = 0; int found = 0;
while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == 0) { while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == 0) {
printf("key: %p %.*s, data: %p %.*s\n", key.iov_base, (int)key.iov_len, (char *)key.iov_base, data.iov_base, printf("key: %p %.*s, data: %p %.*s\n", key.iov_base, (int)key.iov_len,
(int)data.iov_len, (char *)data.iov_base); (char *)key.iov_base, data.iov_base, (int)data.iov_len,
(char *)data.iov_base);
found += 1; found += 1;
} }
if (rc != MDBX_NOTFOUND || found == 0) { if (rc != MDBX_NOTFOUND || found == 0) {

View File

@ -4,7 +4,7 @@
*/ */
/* /*
* Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>. * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
* Copyright 2012-2015 Howard Chu, Symas Corp. * Copyright 2012-2015 Howard Chu, Symas Corp.
* Copyright 2015,2016 Peter-Service R&D LLC. * Copyright 2015,2016 Peter-Service R&D LLC.
* All rights reserved. * All rights reserved.

8776
mdbx.h

File diff suppressed because it is too large

6428
mdbx.h++

File diff suppressed because it is too large

View File

@ -1,173 +0,0 @@
From 349c08cf21b66ecea851340133a1b845c25675f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?=
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= <leo@yuriev.ru>
Date: Tue, 22 Apr 2025 14:38:49 +0300
Subject: [PATCH] package/libmdbx: new package (library/database).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch adds libmdbx:
- libmdbx is one of the fastest compact embeddable key-value ACID database.
- libmdbx has a specific set of properties and capabilities,
focused on creating unique lightweight solutions.
- libmdbx surpasses the legendary LMDB (Lightning Memory-Mapped Database)
in terms of reliability, features and performance.
- more information at https://libmdbx.dqdkfa.ru
The 0.13.6 "Бузина" (Elderberry) is stable release of _libmdbx_ branch with new superior features.
The complete ChangeLog: https://gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md
Signed-off-by: Леонид Юрьев (Leonid Yuriev) <leo@yuriev.ru>
---
DEVELOPERS | 3 +++
package/Config.in | 1 +
package/libmdbx/Config.in | 45 ++++++++++++++++++++++++++++++++++++
package/libmdbx/libmdbx.hash | 6 +++++
package/libmdbx/libmdbx.mk | 42 +++++++++++++++++++++++++++++++++
5 files changed, 97 insertions(+)
create mode 100644 package/libmdbx/Config.in
create mode 100644 package/libmdbx/libmdbx.hash
create mode 100644 package/libmdbx/libmdbx.mk
diff --git a/DEVELOPERS b/DEVELOPERS
index 9ab1e125f4..758ff6a2d5 100644
--- a/DEVELOPERS
+++ b/DEVELOPERS
@@ -1482,6 +1482,9 @@ N: Leon Anavi <leon.anavi@konsulko.com>
F: board/olimex/a10_olinuxino
F: configs/olimex_a10_olinuxino_lime_defconfig
+N: Leonid Yuriev <leo@yuriev.ru>
+F: package/libmdbx/
+
N: Lionel Flandrin <lionel@svkt.org>
F: package/python-babel/
F: package/python-daemonize/
diff --git a/package/Config.in b/package/Config.in
index 016a99ed1a..a6f95bfaa9 100644
--- a/package/Config.in
+++ b/package/Config.in
@@ -1372,6 +1372,7 @@ menu "Database"
source "package/kompexsqlite/Config.in"
source "package/leveldb/Config.in"
source "package/libgit2/Config.in"
+ source "package/libmdbx/Config.in"
source "package/libodb/Config.in"
source "package/libodb-boost/Config.in"
source "package/libodb-mysql/Config.in"
diff --git a/package/libmdbx/Config.in b/package/libmdbx/Config.in
new file mode 100644
index 0000000000..a9a4ac45c5
--- /dev/null
+++ b/package/libmdbx/Config.in
@@ -0,0 +1,45 @@
+config BR2_PACKAGE_LIBMDBX
+ bool "libmdbx"
+ depends on BR2_USE_MMU
+ depends on BR2_TOOLCHAIN_HAS_SYNC_4
+ depends on BR2_TOOLCHAIN_HAS_THREADS
+ depends on BR2_TOOLCHAIN_GCC_AT_LEAST_4_4
+ help
+ One of the fastest compact key-value ACID database
+ without WAL. libmdbx has a specific set of properties
+ and capabilities, focused on creating unique lightweight
+ solutions.
+
+ libmdbx surpasses the legendary LMDB in terms of
+ reliability, features and performance.
+
+ https://libmdbx.dqdkfa.ru
+
+if BR2_PACKAGE_LIBMDBX
+
+config BR2_PACKAGE_LIBMDBX_TOOLS
+ bool "install tools"
+ help
+ Install libmdbx tools for checking, dump, restore
+ and show statistics of databases.
+
+config BR2_PACKAGE_LIBMDBX_CXX
+ bool "C++ API"
+ depends on BR2_INSTALL_LIBSTDCPP
+ depends on BR2_TOOLCHAIN_GCC_AT_LEAST_4_8
+ depends on !BR2_TOOLCHAIN_HAS_GCC_BUG_64735
+ help
+ Enable modern C++11/14/17/20 API for libmdbx.
+
+comment "libmdbx C++ support needs a toolchain w/ C++11, gcc >= 4.8 w/o bug#64735"
+ depends on !BR2_INSTALL_LIBSTDCPP || \
+ !BR2_TOOLCHAIN_GCC_AT_LEAST_4_8 || \
+ BR2_TOOLCHAIN_HAS_GCC_BUG_64735
+
+endif
+
+comment "libmdbx needs MMU, a toolchain w/ threads, gcc >= 4.4 w/ 4-byte atomics"
+ depends on BR2_USE_MMU
+ depends on !BR2_TOOLCHAIN_HAS_THREADS || \
+ !BR2_TOOLCHAIN_HAS_SYNC_4 || \
+ !BR2_TOOLCHAIN_GCC_AT_LEAST_4_4
diff --git a/package/libmdbx/libmdbx.hash b/package/libmdbx/libmdbx.hash
new file mode 100644
index 0000000000..ae5266716b
--- /dev/null
+++ b/package/libmdbx/libmdbx.hash
@@ -0,0 +1,6 @@
+# Hashes from: https://libmdbx.dqdkfa.ru/release/SHA256SUMS
+sha256 57db987de6f7ccc66a66ae28a7bda9f9fbb48ac5fb9279bcca92fd5de13075d1 libmdbx-amalgamated-0.13.6.tar.xz
+
+# Locally calculated
+sha256 0d542e0c8804e39aa7f37eb00da5a762149dc682d7829451287e11b938e94594 LICENSE
+sha256 651f71b46c6bb0046d2122df7f9def9cb24f4dc28c5b11cef059f66565cda30f NOTICE
diff --git a/package/libmdbx/libmdbx.mk b/package/libmdbx/libmdbx.mk
new file mode 100644
index 0000000000..571757262e
--- /dev/null
+++ b/package/libmdbx/libmdbx.mk
@@ -0,0 +1,42 @@
+################################################################################
+#
+# libmdbx
+#
+################################################################################
+
+LIBMDBX_VERSION = 0.13.6
+LIBMDBX_SOURCE = libmdbx-amalgamated-$(LIBMDBX_VERSION).tar.xz
+LIBMDBX_SITE = https://libmdbx.dqdkfa.ru/release
+LIBMDBX_SUPPORTS_IN_SOURCE_BUILD = NO
+LIBMDBX_LICENSE = Apache-2.0
+LIBMDBX_LICENSE_FILES = LICENSE NOTICE
+LIBMDBX_REDISTRIBUTE = YES
+LIBMDBX_STRIP_COMPONENTS = 0
+LIBMDBX_INSTALL_STAGING = YES
+
+# Set CMAKE_BUILD_TYPE to Release to remove -Werror and avoid a build failure
+# with glibc < 2.12
+LIBMDBX_CONF_OPTS = \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DMDBX_INSTALL_MANPAGES=OFF \
+ -DBUILD_FOR_NATIVE_CPU=OFF \
+ -DMDBX_BUILD_CXX=$(if $(BR2_PACKAGE_LIBMDBX_CXX),ON,OFF) \
+ -DMDBX_BUILD_TOOLS=$(if $(BR2_PACKAGE_LIBMDBX_TOOLS),ON,OFF)
+
+ifeq ($(BR2_STATIC_LIBS)$(BR2_SHARED_STATIC_LIBS),y)
+LIBMDBX_CONF_OPTS += -DMDBX_INSTALL_STATIC=ON
+else
+LIBMDBX_CONF_OPTS += -DMDBX_INSTALL_STATIC=OFF
+endif
+
+ifeq ($(BR2_SHARED_LIBS)$(BR2_SHARED_STATIC_LIBS),y)
+LIBMDBX_CONF_OPTS += \
+ -DMDBX_BUILD_SHARED_LIBRARY=ON \
+ -DMDBX_LINK_TOOLS_NONSTATIC=ON
+else
+LIBMDBX_CONF_OPTS += \
+ -DMDBX_BUILD_SHARED_LIBRARY=OFF \
+ -DMDBX_LINK_TOOLS_NONSTATIC=OFF
+endif
+
+$(eval $(cmake-package))
--
2.49.0

184
packages/rpm/CMakeLists.txt Normal file
View File

@ -0,0 +1,184 @@
cmake_minimum_required(VERSION 2.8.7)
set(TARGET mdbx)
project(${TARGET})
set(MDBX_VERSION_MAJOR 0)
set(MDBX_VERSION_MINOR 3)
set(MDBX_VERSION_RELEASE 1)
set(MDBX_VERSION_REVISION 0)
set(MDBX_VERSION_STRING ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE})
enable_language(C)
enable_language(CXX)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED on)
add_definitions(-DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 -D_GNU_SOURCE=1)
find_package(Threads REQUIRED)
get_directory_property(hasParent PARENT_DIRECTORY)
if(hasParent)
set(STANDALONE_BUILD 0)
else()
set(STANDALONE_BUILD 1)
enable_testing()
if (CMAKE_C_COMPILER_ID MATCHES GNU)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wextra")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
endif()
if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wpointer-arith")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat-security")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Woverloaded-virtual")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wwrite-strings")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmax-errors=20")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wunused-function -Wunused-variable -Wunused-value -Wmissing-declarations")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wcast-qual")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finline-functions-called-once")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-packed-bitfield-compat")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g3")
endif()
if (COVERAGE)
if (NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
message(FATAL_ERROR "Coverage requires -DCMAKE_BUILD_TYPE=Debug Current value=${CMAKE_BUILD_TYPE}")
endif()
message(STATUS "Setting coverage compiler flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ggdb3 -O0 --coverage -fprofile-arcs -ftest-coverage")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -ggdb3 -O0 --coverage -fprofile-arcs -ftest-coverage")
add_definitions(-DCOVERAGE_TEST)
endif()
if (NOT TRAVIS)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=leak -fstack-protector-strong -static-libasan")
endif()
endif()
set(${TARGET}_SRC
mdbx.h
src/bits.h
src/defs.h
src/lck-linux.c
src/mdbx.c
src/osal.c
src/osal.h
src/version.c
)
add_library(${TARGET}_STATIC STATIC
${${TARGET}_SRC}
)
add_library(${TARGET} ALIAS ${TARGET}_STATIC)
add_library(${TARGET}_SHARED SHARED
${${TARGET}_SRC}
)
set_target_properties(${TARGET}_SHARED PROPERTIES
VERSION ${MDBX_VERSION_STRING}
SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}
OUTPUT_NAME ${TARGET}
CLEAN_DIRECT_OUTPUT 1
)
set_target_properties(${TARGET}_STATIC PROPERTIES
VERSION ${MDBX_VERSION_STRING}
SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}
OUTPUT_NAME ${TARGET}
CLEAN_DIRECT_OUTPUT 1
)
target_include_directories(${TARGET}_STATIC PUBLIC
${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(${TARGET}_SHARED PUBLIC
${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${TARGET}_STATIC ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET}_SHARED ${CMAKE_THREAD_LIBS_INIT})
if(UNIX AND NOT APPLE)
target_link_libraries(${TARGET}_STATIC rt)
target_link_libraries(${TARGET}_SHARED rt)
endif()
install(TARGETS ${TARGET}_STATIC DESTINATION ${CMAKE_INSTALL_PREFIX}/lib64 COMPONENT mdbx)
install(TARGETS ${TARGET}_SHARED DESTINATION ${CMAKE_INSTALL_PREFIX}/lib64 COMPONENT mdbx)
install(FILES mdbx.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include COMPONENT mdbx-devel)
add_subdirectory(src/tools)
add_subdirectory(test)
add_subdirectory(test/pcrf)
add_subdirectory(tutorial)
##############################################################################
set(CPACK_GENERATOR "RPM")
set(CPACK_RPM_COMPONENT_INSTALL ON)
# Version
if (NOT "$ENV{BUILD_NUMBER}" STREQUAL "")
set(CPACK_PACKAGE_RELEASE $ENV{BUILD_NUMBER})
else()
if (NOT "$ENV{CI_PIPELINE_ID}" STREQUAL "")
set(CPACK_PACKAGE_RELEASE $ENV{CI_PIPELINE_ID})
else()
set(CPACK_PACKAGE_RELEASE 1)
endif()
endif()
set(CPACK_RPM_PACKAGE_RELEASE ${CPACK_PACKAGE_RELEASE})
set(CPACK_PACKAGE_VERSION ${MDBX_VERSION_STRING})
set(CPACK_PACKAGE_VERSION_FULL ${CPACK_PACKAGE_VERSION}-${CPACK_PACKAGE_RELEASE})
set(CPACK_RPM_mdbx-devel_PACKAGE_REQUIRES "mdbx = ${CPACK_PACKAGE_VERSION}")
set(CPACK_RPM_SPEC_INSTALL_POST "/bin/true")
set(CPACK_RPM_mdbx_PACKAGE_NAME mdbx)
set(CPACK_RPM_mdbx-devel_PACKAGE_NAME mdbx-devel)
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "The revised and extended descendant of Symas LMDB")
set(CPACK_PACKAGE_VENDOR "???")
set(CPACK_PACKAGE_CONTACT "Vladimir Romanov")
set(CPACK_PACKAGE_RELOCATABLE false)
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_REQUIRES "")
set(CPACK_RPM_PACKAGE_GROUP "Applications/Database")
set(CPACK_RPM_mdbx_FILE_NAME "${CPACK_RPM_mdbx_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_FULL}.${CPACK_RPM_PACKAGE_ARCHITECTURE}.rpm")
set(CPACK_RPM_mdbx-devel_FILE_NAME "${CPACK_RPM_mdbx-devel_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_FULL}.${CPACK_RPM_PACKAGE_ARCHITECTURE}.rpm")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
/usr/local
/usr/local/bin
/usr/local/lib64
/usr/local/include
/usr/local/man
/usr/local/man/man1
)
include(CPack)

18
packages/rpm/build.sh Executable file
View File

@ -0,0 +1,18 @@
#!/bin/bash
set -e
CONFIG=$1
if [[ -z "${CONFIG}" ]]; then
CONFIG=Debug
fi
if [[ -r /opt/rh/devtoolset-6/enable ]]; then
source /opt/rh/devtoolset-6/enable
fi
#rm -f -r build || true
mkdir -p cmake-build-${CONFIG}
pushd cmake-build-${CONFIG} &> /dev/null
if [[ ! -r Makefile ]]; then
cmake .. -DCMAKE_BUILD_TYPE=${CONFIG}
fi
make -j8 || exit 1
popd &> /dev/null

25
packages/rpm/package.sh Executable file
View File

@ -0,0 +1,25 @@
#!/bin/bash
set -e
CONFIG=$1
if [[ -z "${CONFIG}" ]]; then
CONFIG=Debug
fi
DIRNAME=`dirname ${BASH_SOURCE[0]}`
DIRNAME=`readlink --canonicalize ${DIRNAME}`
if [[ -r /opt/rh/devtoolset-6/enable ]]; then
source /opt/rh/devtoolset-6/enable
fi
mkdir -p cmake-build-${CONFIG}
pushd cmake-build-${CONFIG} &> /dev/null
if [[ ! -r Makefile ]]; then
cmake .. -DCMAKE_BUILD_TYPE=${CONFIG}
fi
rm -f *.rpm
make -j8 package || exit 1
rm -f *-Unspecified.rpm
popd &> /dev/null

View File

@ -1,58 +1,25 @@
/// \copyright SPDX-License-Identifier: Apache-2.0 /*
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025 * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#define xMDBX_ALLOY 1 /* alloyed build */ #define MDBX_ALLOY 1 /* alloyed build */
#include "internals.h" /* must be included first */ #include "internals.h" /* must be included first */
#include "api-cold.c" #include "core.c"
#include "api-copy.c"
#include "api-cursor.c"
#include "api-dbi.c"
#include "api-env.c"
#include "api-extra.c"
#include "api-key-transform.c"
#include "api-misc.c"
#include "api-opts.c"
#include "api-range-estimate.c"
#include "api-txn-data.c"
#include "api-txn.c"
#include "audit.c"
#include "chk.c"
#include "cogs.c"
#include "coherency.c"
#include "cursor.c"
#include "dbi.c"
#include "dpl.c"
#include "dxb.c"
#include "env.c"
#include "gc-get.c"
#include "gc-put.c"
#include "global.c"
#include "lck-posix.c"
#include "lck-windows.c"
#include "lck.c"
#include "logging_and_debug.c"
#include "meta.c"
#include "mvcc-readers.c"
#include "node.c"
#include "osal.c" #include "osal.c"
#include "page-get.c"
#include "page-iov.c"
#include "page-ops.c"
#include "pnl.c"
#include "refund.c"
#include "rkl.c"
#include "spill.c"
#include "table.c"
#include "tls.c"
#include "tree-ops.c"
#include "tree-search.c"
#include "txl.c"
#include "txn-basal.c"
#include "txn-nested.c"
#include "txn-ro.c"
#include "txn.c"
#include "utils.c"
#include "version.c" #include "version.c"
#include "walk.c"
#include "windows-import.c" #if defined(_WIN32) || defined(_WIN64)
#include "lck-windows.c"
#else
#include "lck-posix.c"
#endif

View File

@ -1,543 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
__cold size_t mdbx_default_pagesize(void) {
size_t pagesize = globals.sys_pagesize;
ENSURE(nullptr, is_powerof2(pagesize));
pagesize = (pagesize >= MDBX_MIN_PAGESIZE) ? pagesize : MDBX_MIN_PAGESIZE;
pagesize = (pagesize <= MDBX_MAX_PAGESIZE) ? pagesize : MDBX_MAX_PAGESIZE;
return pagesize;
}
__cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) {
if (pagesize < 1)
pagesize = (intptr_t)mdbx_default_pagesize();
else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
!is_powerof2((size_t)pagesize)))
return -1;
return MIN_PAGENO * pagesize;
}
__cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) {
if (pagesize < 1)
pagesize = (intptr_t)mdbx_default_pagesize();
else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
!is_powerof2((size_t)pagesize)))
return -1;
STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize;
return (limit < MAX_MAPSIZE) ? (intptr_t)limit : (intptr_t)MAX_MAPSIZE;
}
__cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) {
if (pagesize < 1)
pagesize = (intptr_t)mdbx_default_pagesize();
else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
!is_powerof2((size_t)pagesize)))
return -1;
STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
const uint64_t pgl_limit = pagesize * (uint64_t)(PAGELIST_LIMIT / MDBX_GOLD_RATIO_DBL);
const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL);
return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit;
}
__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags) {
if (pagesize < 1)
pagesize = (intptr_t)mdbx_default_pagesize();
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
!is_powerof2((size_t)pagesize)))
return -1;
return keysize_max(pagesize, flags);
}
__cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env, MDBX_db_flags_t flags) {
if (unlikely(!env || env->signature.weak != env_signature))
return -1;
return (int)mdbx_limits_keysize_max((intptr_t)env->ps, flags);
}
__cold int mdbx_env_get_maxkeysize(const MDBX_env *env) { return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT); }
__cold intptr_t mdbx_limits_keysize_min(MDBX_db_flags_t flags) { return keysize_min(flags); }
__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags) {
if (pagesize < 1)
pagesize = (intptr_t)mdbx_default_pagesize();
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
!is_powerof2((size_t)pagesize)))
return -1;
return valsize_max(pagesize, flags);
}
__cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags) {
if (unlikely(!env || env->signature.weak != env_signature))
return -1;
return (int)mdbx_limits_valsize_max((intptr_t)env->ps, flags);
}
__cold intptr_t mdbx_limits_valsize_min(MDBX_db_flags_t flags) { return valsize_min(flags); }
__cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags) {
if (pagesize < 1)
pagesize = (intptr_t)mdbx_default_pagesize();
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
!is_powerof2((size_t)pagesize)))
return -1;
if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP))
return BRANCH_NODE_MAX(pagesize) - NODESIZE;
return LEAF_NODE_MAX(pagesize) - NODESIZE;
}
__cold int mdbx_env_get_pairsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags) {
if (unlikely(!env || env->signature.weak != env_signature))
return -1;
return (int)mdbx_limits_pairsize4page_max((intptr_t)env->ps, flags);
}
__cold intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags) {
if (pagesize < 1)
pagesize = (intptr_t)mdbx_default_pagesize();
if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || pagesize > (intptr_t)MDBX_MAX_PAGESIZE ||
!is_powerof2((size_t)pagesize)))
return -1;
if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP))
return valsize_max(pagesize, flags);
return PAGESPACE(pagesize);
}
__cold int mdbx_env_get_valsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags) {
if (unlikely(!env || env->signature.weak != env_signature))
return -1;
return (int)mdbx_limits_valsize4page_max((intptr_t)env->ps, flags);
}
/*----------------------------------------------------------------------------*/
static size_t estimate_rss(size_t database_bytes) {
return database_bytes + database_bytes / 64 + (512 + MDBX_WORDBITS * 16) * MEGABYTE;
}
__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, MDBX_warmup_flags_t flags,
unsigned timeout_seconds_16dot16) {
if (unlikely(env == nullptr && txn == nullptr))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(flags > (MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock | MDBX_warmup_touchlimit |
MDBX_warmup_release)))
return LOG_IFERR(MDBX_EINVAL);
if (txn) {
int err = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_ERROR);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
}
if (env) {
int err = check_env(env, false);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
if (txn && unlikely(txn->env != env))
return LOG_IFERR(MDBX_EINVAL);
} else {
env = txn->env;
}
const uint64_t timeout_monotime = (timeout_seconds_16dot16 && (flags & MDBX_warmup_force))
? osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16)
: 0;
if (flags & MDBX_warmup_release)
munlock_all(env);
pgno_t used_pgno;
if (txn) {
used_pgno = txn->geo.first_unallocated;
} else {
const troika_t troika = meta_tap(env);
used_pgno = meta_recent(env, &troika).ptr_v->geometry.first_unallocated;
}
const size_t used_range = pgno_align2os_bytes(env, used_pgno);
const pgno_t mlock_pgno = bytes2pgno(env, used_range);
int rc = MDBX_SUCCESS;
if (flags & MDBX_warmup_touchlimit) {
const size_t estimated_rss = estimate_rss(used_range);
#if defined(_WIN32) || defined(_WIN64)
SIZE_T current_ws_lower, current_ws_upper;
if (GetProcessWorkingSetSize(GetCurrentProcess(), &current_ws_lower, &current_ws_upper) &&
current_ws_lower < estimated_rss) {
const SIZE_T ws_lower = estimated_rss;
const SIZE_T ws_upper =
(MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048) ? ws_lower : ws_lower + MDBX_WORDBITS * MEGABYTE * 32;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) {
rc = (int)GetLastError();
WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower, ws_upper, rc);
}
}
#endif /* Windows */
#ifdef RLIMIT_RSS
struct rlimit rss;
if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) {
rss.rlim_cur = estimated_rss;
if (rss.rlim_max < estimated_rss)
rss.rlim_max = estimated_rss;
if (setrlimit(RLIMIT_RSS, &rss)) {
rc = errno;
WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS", (size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc);
}
}
#endif /* RLIMIT_RSS */
#ifdef RLIMIT_MEMLOCK
if (flags & MDBX_warmup_lock) {
struct rlimit memlock;
if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 && memlock.rlim_cur < estimated_rss) {
memlock.rlim_cur = estimated_rss;
if (memlock.rlim_max < estimated_rss)
memlock.rlim_max = estimated_rss;
if (setrlimit(RLIMIT_MEMLOCK, &memlock)) {
rc = errno;
WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK", (size_t)memlock.rlim_cur,
(size_t)memlock.rlim_max, rc);
}
}
}
#endif /* RLIMIT_MEMLOCK */
(void)estimated_rss;
}
#if defined(MLOCK_ONFAULT) && \
((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) || (defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) && \
(defined(__linux__) || defined(__gnu_linux__))
if ((flags & MDBX_warmup_lock) != 0 && globals.linux_kernel_version >= 0x04040000 &&
atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
if (mlock2(env->dxb_mmap.base, used_range, MLOCK_ONFAULT)) {
rc = errno;
WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc);
} else {
update_mlcnt(env, mlock_pgno, true);
rc = MDBX_SUCCESS;
}
if (rc != EINVAL)
flags -= MDBX_warmup_lock;
}
#endif /* MLOCK_ONFAULT */
int err = MDBX_ENOSYS;
err = dxb_set_readahead(env, used_pgno, true, true);
if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS)
rc = err;
if ((flags & MDBX_warmup_force) != 0 && (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) {
const volatile uint8_t *ptr = env->dxb_mmap.base;
size_t offset = 0, unused = 42;
#if !(defined(_WIN32) || defined(_WIN64))
if (flags & MDBX_warmup_oomsafe) {
const int null_fd = open("/dev/null", O_WRONLY);
if (unlikely(null_fd < 0))
rc = errno;
else {
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
for (;;) {
unsigned i;
for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) {
iov[i].iov_base = (void *)(ptr + offset);
iov[i].iov_len = 1;
offset += globals.sys_pagesize;
}
if (unlikely(writev(null_fd, iov, i) < 0)) {
rc = errno;
if (rc == EFAULT)
rc = ENOMEM;
break;
}
if (offset >= used_range) {
rc = MDBX_SUCCESS;
break;
}
if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
rc = MDBX_RESULT_TRUE;
break;
}
}
close(null_fd);
}
} else
#endif /* Windows */
for (;;) {
unused += ptr[offset];
offset += globals.sys_pagesize;
if (offset >= used_range) {
rc = MDBX_SUCCESS;
break;
}
if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
rc = MDBX_RESULT_TRUE;
break;
}
}
(void)unused;
}
if ((flags & MDBX_warmup_lock) != 0 && (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) &&
atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
#if defined(_WIN32) || defined(_WIN64)
if (VirtualLock(env->dxb_mmap.base, used_range)) {
update_mlcnt(env, mlock_pgno, true);
rc = MDBX_SUCCESS;
} else {
rc = (int)GetLastError();
WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc);
}
#elif defined(_POSIX_MEMLOCK_RANGE)
if (mlock(env->dxb_mmap.base, used_range) == 0) {
update_mlcnt(env, mlock_pgno, true);
rc = MDBX_SUCCESS;
} else {
rc = errno;
WARNING("%s(%zu) error %d", "mlock", used_range, rc);
}
#else
rc = MDBX_ENOSYS;
#endif
}
return LOG_IFERR(rc);
}
/*----------------------------------------------------------------------------*/
__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!arg))
return LOG_IFERR(MDBX_EINVAL);
*arg = env->lazy_fd;
return MDBX_SUCCESS;
}
__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, bool onoff) {
int rc = check_env(env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(flags & ((env->flags & ENV_ACTIVE) ? ~ENV_CHANGEABLE_FLAGS : ~ENV_USABLE_FLAGS)))
return LOG_IFERR(MDBX_EPERM);
if (unlikely(env->flags & MDBX_RDONLY))
return LOG_IFERR(MDBX_EACCESS);
const bool lock_needed = (env->flags & ENV_ACTIVE) && !env_owned_wrtxn(env);
bool should_unlock = false;
if (lock_needed) {
rc = lck_txn_lock(env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
should_unlock = true;
}
if (onoff)
env->flags = combine_durability_flags(env->flags, flags);
else
env->flags &= ~flags;
if (should_unlock)
lck_txn_unlock(env);
return MDBX_SUCCESS;
}
__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags) {
int rc = check_env(env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!flags))
return LOG_IFERR(MDBX_EINVAL);
*flags = env->flags & ENV_USABLE_FLAGS;
return MDBX_SUCCESS;
}
__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) {
int rc = check_env(env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
env->userctx = ctx;
return MDBX_SUCCESS;
}
__cold void *mdbx_env_get_userctx(const MDBX_env *env) { return env ? env->userctx : nullptr; }
__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) {
int rc = check_env(env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
#if MDBX_DEBUG
env->assert_func = func;
return MDBX_SUCCESS;
#else
(void)func;
return LOG_IFERR(MDBX_ENOSYS);
#endif
}
__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) {
int rc = check_env(env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
env->hsr_callback = hsr;
return MDBX_SUCCESS;
}
__cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) {
return likely(env && env->signature.weak == env_signature) ? env->hsr_callback : nullptr;
}
#if defined(_WIN32) || defined(_WIN64)
__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!arg))
return LOG_IFERR(MDBX_EINVAL);
*arg = env->pathname.specified;
return MDBX_SUCCESS;
}
#endif /* Windows */
__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!arg))
return LOG_IFERR(MDBX_EINVAL);
#if defined(_WIN32) || defined(_WIN64)
if (!env->pathname_char) {
*arg = nullptr;
DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80;
size_t mb_len =
WideCharToMultiByte(CP_THREAD_ACP, flags, env->pathname.specified, -1, nullptr, 0, nullptr, nullptr);
rc = mb_len ? MDBX_SUCCESS : (int)GetLastError();
if (rc == ERROR_INVALID_FLAGS) {
mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->pathname.specified, -1, nullptr, 0, nullptr, nullptr);
rc = mb_len ? MDBX_SUCCESS : (int)GetLastError();
}
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
char *const mb_pathname = osal_malloc(mb_len);
if (!mb_pathname)
return LOG_IFERR(MDBX_ENOMEM);
if (mb_len != (size_t)WideCharToMultiByte(CP_THREAD_ACP, flags, env->pathname.specified, -1, mb_pathname,
(int)mb_len, nullptr, nullptr)) {
rc = (int)GetLastError();
osal_free(mb_pathname);
return LOG_IFERR(rc);
}
if (env->pathname_char ||
InterlockedCompareExchangePointer((PVOID volatile *)&env->pathname_char, mb_pathname, nullptr))
osal_free(mb_pathname);
}
*arg = env->pathname_char;
#else
*arg = env->pathname.specified;
#endif /* Windows */
return MDBX_SUCCESS;
}
/*------------------------------------------------------------------------------
* Legacy API */
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret) {
return __inline_mdbx_txn_begin(env, parent, flags, ret);
}
LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); }
LIBMDBX_API __cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) {
return __inline_mdbx_env_stat(env, stat, bytes);
}
LIBMDBX_API __cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, size_t bytes) {
return __inline_mdbx_env_info(env, info, bytes);
}
LIBMDBX_API int mdbx_dbi_flags(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) {
return __inline_mdbx_dbi_flags(txn, dbi, flags);
}
LIBMDBX_API __cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); }
LIBMDBX_API __cold int mdbx_env_sync_poll(MDBX_env *env) { return __inline_mdbx_env_sync_poll(env); }
LIBMDBX_API __cold int mdbx_env_close(MDBX_env *env) { return __inline_mdbx_env_close(env); }
LIBMDBX_API __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) {
return __inline_mdbx_env_set_mapsize(env, size);
}
LIBMDBX_API __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) {
return __inline_mdbx_env_set_maxdbs(env, dbs);
}
LIBMDBX_API __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) {
return __inline_mdbx_env_get_maxdbs(env, dbs);
}
LIBMDBX_API __cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) {
return __inline_mdbx_env_set_maxreaders(env, readers);
}
LIBMDBX_API __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) {
return __inline_mdbx_env_get_maxreaders(env, readers);
}
LIBMDBX_API __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) {
return __inline_mdbx_env_set_syncbytes(env, threshold);
}
LIBMDBX_API __cold int mdbx_env_get_syncbytes(const MDBX_env *env, size_t *threshold) {
return __inline_mdbx_env_get_syncbytes(env, threshold);
}
LIBMDBX_API __cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) {
return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16);
}
LIBMDBX_API __cold int mdbx_env_get_syncperiod(const MDBX_env *env, unsigned *seconds_16dot16) {
return __inline_mdbx_env_get_syncperiod(env, seconds_16dot16);
}
LIBMDBX_API __cold uint64_t mdbx_key_from_int64(const int64_t i64) { return __inline_mdbx_key_from_int64(i64); }
LIBMDBX_API __cold uint32_t mdbx_key_from_int32(const int32_t i32) { return __inline_mdbx_key_from_int32(i32); }
LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_min(void) { return __inline_mdbx_limits_pgsize_min(); }
LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_max(void) { return __inline_mdbx_limits_pgsize_max(); }
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */

View File

@ -1,880 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
typedef struct compacting_context {
MDBX_env *env;
MDBX_txn *txn;
MDBX_copy_flags_t flags;
pgno_t first_unallocated;
osal_condpair_t condpair;
volatile unsigned head;
volatile unsigned tail;
uint8_t *write_buf[2];
size_t write_len[2];
/* Error code. Never cleared if set. Both threads can set nonzero
* to fail the copy. Not mutex-protected, expects atomic int. */
volatile int error;
mdbx_filehandle_t fd;
} ctx_t;
__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree);
/* Dedicated writer thread for compacting copy. */
__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
ctx_t *const ctx = arg;
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
sigset_t sigset;
sigemptyset(&sigset);
sigaddset(&sigset, SIGPIPE);
ctx->error = pthread_sigmask(SIG_BLOCK, &sigset, nullptr);
#endif /* EPIPE */
osal_condpair_lock(&ctx->condpair);
while (!ctx->error) {
while (ctx->tail == ctx->head && !ctx->error) {
int err = osal_condpair_wait(&ctx->condpair, true);
if (err != MDBX_SUCCESS) {
ctx->error = err;
goto bailout;
}
}
const unsigned toggle = ctx->tail & 1;
size_t wsize = ctx->write_len[toggle];
if (wsize == 0) {
ctx->tail += 1;
break /* EOF */;
}
ctx->write_len[toggle] = 0;
uint8_t *ptr = ctx->write_buf[toggle];
if (!ctx->error) {
int err = osal_write(ctx->fd, ptr, wsize);
if (err != MDBX_SUCCESS) {
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
if (err == EPIPE) {
/* Collect the pending SIGPIPE,
* otherwise at least OS X gives it to the process on thread-exit. */
int unused;
sigwait(&sigset, &unused);
}
#endif /* EPIPE */
ctx->error = err;
goto bailout;
}
}
ctx->tail += 1;
osal_condpair_signal(&ctx->condpair, false);
}
bailout:
osal_condpair_unlock(&ctx->condpair);
return (THREAD_RESULT)0;
}
/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */
__cold static int compacting_toggle_write_buffers(ctx_t *ctx) {
osal_condpair_lock(&ctx->condpair);
eASSERT(ctx->env, ctx->head - ctx->tail < 2 || ctx->error);
ctx->head += 1;
osal_condpair_signal(&ctx->condpair, true);
while (!ctx->error && ctx->head - ctx->tail == 2 /* both buffers in use */) {
if (ctx->flags & MDBX_CP_THROTTLE_MVCC)
mdbx_txn_park(ctx->txn, false);
int err = osal_condpair_wait(&ctx->condpair, false);
if (err == MDBX_SUCCESS && (ctx->flags & MDBX_CP_THROTTLE_MVCC) != 0)
err = mdbx_txn_unpark(ctx->txn, false);
if (err != MDBX_SUCCESS)
ctx->error = err;
}
osal_condpair_unlock(&ctx->condpair);
return ctx->error;
}
static int compacting_put_bytes(ctx_t *ctx, const void *src, size_t bytes, pgno_t pgno, pgno_t npages) {
assert(pgno == 0 || bytes > PAGEHDRSZ);
while (bytes > 0) {
const size_t side = ctx->head & 1;
const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->write_len[side];
if (left < (pgno ? PAGEHDRSZ : 1)) {
int err = compacting_toggle_write_buffers(ctx);
if (unlikely(err != MDBX_SUCCESS))
return err;
continue;
}
const size_t chunk = (bytes < left) ? bytes : left;
void *const dst = ctx->write_buf[side] + ctx->write_len[side];
if (src) {
memcpy(dst, src, chunk);
if (pgno) {
assert(chunk > PAGEHDRSZ);
page_t *mp = dst;
mp->pgno = pgno;
if (mp->txnid == 0)
mp->txnid = ctx->txn->txnid;
if (mp->flags == P_LARGE) {
assert(bytes <= pgno2bytes(ctx->env, npages));
mp->pages = npages;
}
pgno = 0;
}
src = ptr_disp(src, chunk);
} else
memset(dst, 0, chunk);
bytes -= chunk;
ctx->write_len[side] += chunk;
}
return MDBX_SUCCESS;
}
static int compacting_put_page(ctx_t *ctx, const page_t *mp, const size_t head_bytes, const size_t tail_bytes,
const pgno_t npages) {
if (tail_bytes) {
assert(head_bytes + tail_bytes <= ctx->env->ps);
assert(npages == 1 && (page_type(mp) == P_BRANCH || page_type(mp) == P_LEAF));
} else {
assert(head_bytes <= pgno2bytes(ctx->env, npages));
assert((npages == 1 && page_type(mp) == (P_LEAF | P_DUPFIX)) || page_type(mp) == P_LARGE);
}
const pgno_t pgno = ctx->first_unallocated;
ctx->first_unallocated += npages;
int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages);
if (unlikely(err != MDBX_SUCCESS))
return err;
err = compacting_put_bytes(ctx, nullptr, pgno2bytes(ctx->env, npages) - (head_bytes + tail_bytes), 0, 0);
if (unlikely(err != MDBX_SUCCESS))
return err;
return compacting_put_bytes(ctx, ptr_disp(mp, ctx->env->ps - tail_bytes), tail_bytes, 0, 0);
}
__cold static int compacting_walk(ctx_t *ctx, MDBX_cursor *mc, pgno_t *const parent_pgno, txnid_t parent_txnid) {
mc->top = 0;
mc->ki[0] = 0;
int rc = page_get(mc, *parent_pgno, &mc->pg[0], parent_txnid);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
rc = tree_search_finalize(mc, nullptr, Z_FIRST);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
/* Make cursor pages writable */
const intptr_t deep_limit = mc->top + 1;
void *const buf = osal_malloc(pgno2bytes(ctx->env, deep_limit + 1));
if (buf == nullptr)
return MDBX_ENOMEM;
void *ptr = buf;
for (intptr_t i = 0; i <= mc->top; i++) {
page_copy(ptr, mc->pg[i], ctx->env->ps);
mc->pg[i] = ptr;
ptr = ptr_disp(ptr, ctx->env->ps);
}
/* This is writable space for a leaf page. Usually not needed. */
page_t *const leaf = ptr;
while (mc->top >= 0) {
page_t *mp = mc->pg[mc->top];
const size_t nkeys = page_numkeys(mp);
if (is_leaf(mp)) {
if (!(mc->flags & z_inner) /* may have nested N_TREE or N_BIG nodes */) {
for (size_t i = 0; i < nkeys; i++) {
node_t *node = page_node(mp, i);
if (node_flags(node) == N_BIG) {
/* Need writable leaf */
if (mp != leaf) {
mc->pg[mc->top] = leaf;
page_copy(leaf, mp, ctx->env->ps);
mp = leaf;
node = page_node(mp, i);
}
const pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->txnid);
if (unlikely((rc = lp.err) != MDBX_SUCCESS))
goto bailout;
const size_t datasize = node_ds(node);
const pgno_t npages = largechunk_npages(ctx->env, datasize);
poke_pgno(node_data(node), ctx->first_unallocated);
rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, npages);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
} else if (node_flags(node) & N_TREE) {
if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(tree_t))) {
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid dupsort sub-tree node size",
(unsigned)node_ds(node));
rc = MDBX_CORRUPTED;
goto bailout;
}
/* Need writable leaf */
if (mp != leaf) {
mc->pg[mc->top] = leaf;
page_copy(leaf, mp, ctx->env->ps);
mp = leaf;
node = page_node(mp, i);
}
tree_t *nested = nullptr;
if (node_flags(node) & N_DUP) {
rc = cursor_dupsort_setup(mc, node, mp);
if (likely(rc == MDBX_SUCCESS)) {
nested = &mc->subcur->nested_tree;
rc = compacting_walk(ctx, &mc->subcur->cursor, &nested->root, mp->txnid);
}
} else {
cASSERT(mc, (mc->flags & z_inner) == 0 && mc->subcur == 0);
cursor_couple_t *couple = container_of(mc, cursor_couple_t, outer);
nested = &couple->inner.nested_tree;
memcpy(nested, node_data(node), sizeof(tree_t));
rc = compacting_walk_tree(ctx, nested);
}
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
memcpy(node_data(node), nested, sizeof(tree_t));
}
}
}
} else {
mc->ki[mc->top]++;
if (mc->ki[mc->top] < nkeys) {
for (;;) {
const node_t *node = page_node(mp, mc->ki[mc->top]);
rc = page_get(mc, node_pgno(node), &mp, mp->txnid);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
mc->top += 1;
if (unlikely(mc->top >= deep_limit)) {
rc = MDBX_CURSOR_FULL;
goto bailout;
}
mc->ki[mc->top] = 0;
if (!is_branch(mp)) {
mc->pg[mc->top] = mp;
break;
}
/* Whenever we advance to a sibling branch page,
* we must proceed all the way down to its first leaf. */
page_copy(mc->pg[mc->top], mp, ctx->env->ps);
}
continue;
}
}
const pgno_t pgno = ctx->first_unallocated;
if (likely(!is_dupfix_leaf(mp))) {
rc = compacting_put_page(ctx, mp, PAGEHDRSZ + mp->lower, ctx->env->ps - (PAGEHDRSZ + mp->upper), 1);
} else {
rc = compacting_put_page(ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->dupfix_ksize, 0, 1);
}
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
if (mc->top) {
/* Update parent if there is one */
node_set_pgno(page_node(mc->pg[mc->top - 1], mc->ki[mc->top - 1]), pgno);
cursor_pop(mc);
} else {
/* Otherwise we're done */
*parent_pgno = pgno;
break;
}
}
bailout:
osal_free(buf);
return rc;
}
__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree) {
if (unlikely(tree->root == P_INVALID))
return MDBX_SUCCESS; /* empty db */
cursor_couple_t couple;
memset(&couple, 0, sizeof(couple));
couple.inner.cursor.signature = ~cur_signature_live;
kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}};
int rc = cursor_init4walk(&couple, ctx->txn, tree, &kvx);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
couple.outer.checking |= z_ignord | z_pagecheck;
couple.inner.cursor.checking |= z_ignord | z_pagecheck;
if (!tree->mod_txnid)
tree->mod_txnid = ctx->txn->txnid;
return compacting_walk(ctx, &couple.outer, &tree->root, tree->mod_txnid);
}
__cold static void compacting_fixup_meta(MDBX_env *env, meta_t *meta) {
eASSERT(env, meta->trees.gc.mod_txnid || meta->trees.gc.root == P_INVALID);
eASSERT(env, meta->trees.main.mod_txnid || meta->trees.main.root == P_INVALID);
/* Calculate filesize taking in account shrink/growing thresholds */
if (meta->geometry.first_unallocated != meta->geometry.now) {
meta->geometry.now = meta->geometry.first_unallocated;
const size_t aligner = pv2pages(meta->geometry.grow_pv ? meta->geometry.grow_pv : meta->geometry.shrink_pv);
if (aligner) {
const pgno_t aligned = pgno_align2os_pgno(env, meta->geometry.first_unallocated + aligner -
meta->geometry.first_unallocated % aligner);
meta->geometry.now = aligned;
}
}
if (meta->geometry.now < meta->geometry.lower)
meta->geometry.now = meta->geometry.lower;
if (meta->geometry.now > meta->geometry.upper)
meta->geometry.now = meta->geometry.upper;
/* Update signature */
assert(meta->geometry.now >= meta->geometry.first_unallocated);
meta_sign_as_steady(meta);
}
/* Make resizable */
__cold static void meta_make_sizeable(meta_t *meta) {
meta->geometry.lower = MIN_PAGENO;
if (meta->geometry.grow_pv == 0) {
const pgno_t step = 1 + (meta->geometry.upper - meta->geometry.lower) / 42;
meta->geometry.grow_pv = pages2pv(step);
}
if (meta->geometry.shrink_pv == 0) {
const pgno_t step = pv2pages(meta->geometry.grow_pv) << 1;
meta->geometry.shrink_pv = pages2pv(step);
}
}
__cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *txn, mdbx_filehandle_t fd, uint8_t *buffer,
const bool dest_is_pipe, const MDBX_copy_flags_t flags) {
const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize);
meta_t *const meta = meta_init_triplet(env, buffer);
meta_set_txnid(env, meta, txn->txnid);
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
meta_make_sizeable(meta);
/* copy canary sequences if present */
if (txn->canary.v) {
meta->canary = txn->canary;
meta->canary.v = constmeta_txnid(meta);
}
if (txn->dbs[MAIN_DBI].root == P_INVALID) {
/* When the DB is empty, handle it specially to
* fix any breakage like page leaks from ITS#8174. */
meta->trees.main.flags = txn->dbs[MAIN_DBI].flags;
compacting_fixup_meta(env, meta);
if (dest_is_pipe) {
if (flags & MDBX_CP_THROTTLE_MVCC)
mdbx_txn_park(txn, false);
int rc = osal_write(fd, buffer, meta_bytes);
if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_THROTTLE_MVCC) != 0)
rc = mdbx_txn_unpark(txn, false);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
} else {
/* Count free pages + GC pages. */
cursor_couple_t couple;
int rc = cursor_init(&couple.outer, txn, FREE_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
pgno_t gc_npages = txn->dbs[FREE_DBI].branch_pages + txn->dbs[FREE_DBI].leaf_pages + txn->dbs[FREE_DBI].large_pages;
MDBX_val key, data;
rc = outer_first(&couple.outer, &key, &data);
while (rc == MDBX_SUCCESS) {
const pnl_t pnl = data.iov_base;
if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(pnl))) {
ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-record length", data.iov_len);
return MDBX_CORRUPTED;
}
if (unlikely(!pnl_check(pnl, txn->geo.first_unallocated))) {
ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-record content");
return MDBX_CORRUPTED;
}
gc_npages += MDBX_PNL_GETSIZE(pnl);
rc = outer_next(&couple.outer, &key, &data, MDBX_NEXT);
}
if (unlikely(rc != MDBX_NOTFOUND))
return rc;
meta->geometry.first_unallocated = txn->geo.first_unallocated - gc_npages;
meta->trees.main = txn->dbs[MAIN_DBI];
ctx_t ctx;
memset(&ctx, 0, sizeof(ctx));
rc = osal_condpair_init(&ctx.condpair);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF);
ctx.write_buf[0] = data_buffer;
ctx.write_buf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF;
ctx.first_unallocated = NUM_METAS;
ctx.env = env;
ctx.fd = fd;
ctx.txn = txn;
ctx.flags = flags;
osal_thread_t thread;
int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx);
if (likely(thread_err == MDBX_SUCCESS)) {
if (dest_is_pipe) {
if (!meta->trees.main.mod_txnid)
meta->trees.main.mod_txnid = txn->txnid;
compacting_fixup_meta(env, meta);
if (flags & MDBX_CP_THROTTLE_MVCC)
mdbx_txn_park(txn, false);
rc = osal_write(fd, buffer, meta_bytes);
if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_THROTTLE_MVCC) != 0)
rc = mdbx_txn_unpark(txn, false);
}
if (likely(rc == MDBX_SUCCESS))
rc = compacting_walk_tree(&ctx, &meta->trees.main);
if (ctx.write_len[ctx.head & 1])
/* toggle to flush non-empty buffers */
compacting_toggle_write_buffers(&ctx);
if (likely(rc == MDBX_SUCCESS) && unlikely(meta->geometry.first_unallocated != ctx.first_unallocated)) {
if (ctx.first_unallocated > meta->geometry.first_unallocated) {
ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO " %c expected %" PRIaPGNO,
"has double-used pages or other corruption", ctx.first_unallocated, '>',
meta->geometry.first_unallocated);
rc = MDBX_CORRUPTED; /* corrupted DB */
}
if (ctx.first_unallocated < meta->geometry.first_unallocated) {
WARNING("the source DB %s: post-compactification used pages %" PRIaPGNO " %c expected %" PRIaPGNO,
"has page leak(s)", ctx.first_unallocated, '<', meta->geometry.first_unallocated);
if (dest_is_pipe)
/* the root within already written meta-pages is wrong */
rc = MDBX_CORRUPTED;
}
/* fixup meta */
meta->geometry.first_unallocated = ctx.first_unallocated;
}
/* toggle with empty buffers to exit thread's loop */
eASSERT(env, (ctx.write_len[ctx.head & 1]) == 0);
compacting_toggle_write_buffers(&ctx);
thread_err = osal_thread_join(thread);
eASSERT(env, (ctx.tail == ctx.head && ctx.write_len[ctx.head & 1] == 0) || ctx.error);
osal_condpair_destroy(&ctx.condpair);
}
if (unlikely(thread_err != MDBX_SUCCESS))
return thread_err;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (unlikely(ctx.error != MDBX_SUCCESS))
return ctx.error;
if (!dest_is_pipe)
compacting_fixup_meta(env, meta);
}
if (flags & MDBX_CP_THROTTLE_MVCC)
mdbx_txn_park(txn, false);
/* Extend file if required */
if (meta->geometry.now != meta->geometry.first_unallocated) {
const size_t whole_size = pgno2bytes(env, meta->geometry.now);
if (!dest_is_pipe)
return osal_ftruncate(fd, whole_size);
const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated);
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
for (size_t offset = used_size; offset < whole_size;) {
const size_t chunk =
((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) ? (size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset;
int rc = osal_write(fd, data_buffer, chunk);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
offset += chunk;
}
}
return MDBX_SUCCESS;
}
//----------------------------------------------------------------------------
__cold static int copy_asis(MDBX_env *env, MDBX_txn *txn, mdbx_filehandle_t fd, uint8_t *buffer,
const bool dest_is_pipe, const MDBX_copy_flags_t flags) {
bool should_unlock = false;
if ((txn->flags & MDBX_TXN_RDONLY) != 0 && (flags & MDBX_CP_RENEW_TXN) != 0) {
/* Try to temporarily block writers until we snapshot the meta pages */
int err = lck_txn_lock(env, true);
if (likely(err == MDBX_SUCCESS))
should_unlock = true;
else if (unlikely(err != MDBX_BUSY))
return err;
}
jitter4testing(false);
int rc = MDBX_SUCCESS;
const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
troika_t troika = meta_tap(env);
/* Make a snapshot of the meta-pages, but write them out only after the data has been flushed */
retry_snap_meta:
memcpy(buffer, env->dxb_mmap.base, meta_bytes);
const meta_ptr_t recent = meta_recent(env, &troika);
meta_t *headcopy = /* LY: get pointer to the snapshot copy */
ptr_disp(buffer, ptr_dist(recent.ptr_c, env->dxb_mmap.base));
jitter4testing(false);
if (txn->flags & MDBX_TXN_RDONLY) {
if (recent.txnid != txn->txnid) {
if (flags & MDBX_CP_RENEW_TXN)
rc = mdbx_txn_renew(txn);
else {
rc = MDBX_MVCC_RETARDED;
for (size_t n = 0; n < NUM_METAS; ++n) {
meta_t *const meta = page_meta(ptr_disp(buffer, pgno2bytes(env, n)));
if (troika.txnid[n] == txn->txnid && ((/* is_steady */ (troika.fsm >> n) & 1) || rc != MDBX_SUCCESS)) {
rc = MDBX_SUCCESS;
headcopy = meta;
} else if (troika.txnid[n] > txn->txnid)
meta_set_txnid(env, meta, 0);
}
}
}
if (should_unlock)
lck_txn_unlock(env);
else {
troika_t snap = meta_tap(env);
if (memcmp(&troika, &snap, sizeof(troika_t)) && rc == MDBX_SUCCESS) {
troika = snap;
goto retry_snap_meta;
}
}
}
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (txn->flags & MDBX_TXN_RDONLY)
eASSERT(env, meta_txnid(headcopy) == txn->txnid);
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
meta_make_sizeable(headcopy);
/* Update signature to steady */
meta_sign_as_steady(headcopy);
/* Copy the data */
const size_t whole_size = pgno_align2os_bytes(env, txn->geo.end_pgno);
const size_t used_size = pgno2bytes(env, txn->geo.first_unallocated);
jitter4testing(false);
if (flags & MDBX_CP_THROTTLE_MVCC)
mdbx_txn_park(txn, false);
if (dest_is_pipe)
rc = osal_write(fd, buffer, meta_bytes);
uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize);
#if MDBX_USE_COPYFILERANGE
static bool copyfilerange_unavailable;
#if (defined(__linux__) || defined(__gnu_linux__))
if (globals.linux_kernel_version >= 0x05030000 && globals.linux_kernel_version < 0x05130000)
copyfilerange_unavailable = true;
#endif /* linux */
bool not_the_same_filesystem = false;
if (!copyfilerange_unavailable) {
struct statfs statfs_info;
if (fstatfs(fd, &statfs_info) || statfs_info.f_type == /* ECRYPTFS_SUPER_MAGIC */ 0xf15f)
/* avoid using copy_file_range() on ecryptfs due to its bugs */
not_the_same_filesystem = true;
}
#endif /* MDBX_USE_COPYFILERANGE */
for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) {
if (flags & MDBX_CP_THROTTLE_MVCC) {
rc = mdbx_txn_unpark(txn, false);
if (unlikely(rc != MDBX_SUCCESS))
break;
}
#if MDBX_USE_SENDFILE
static bool sendfile_unavailable;
if (dest_is_pipe && likely(!sendfile_unavailable)) {
off_t in_offset = offset;
const ssize_t written = sendfile(fd, env->lazy_fd, &in_offset, used_size - offset);
if (likely(written > 0)) {
offset = in_offset;
if (flags & MDBX_CP_THROTTLE_MVCC)
rc = mdbx_txn_park(txn, false);
continue;
}
rc = MDBX_ENODATA;
if (written == 0 || ignore_enosys_and_eagain(rc = errno) != MDBX_RESULT_TRUE)
break;
sendfile_unavailable = true;
}
#endif /* MDBX_USE_SENDFILE */
#if MDBX_USE_COPYFILERANGE
if (!dest_is_pipe && !not_the_same_filesystem && likely(!copyfilerange_unavailable)) {
off_t in_offset = offset, out_offset = offset;
ssize_t bytes_copied = copy_file_range(env->lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0);
if (likely(bytes_copied > 0)) {
offset = in_offset;
if (flags & MDBX_CP_THROTTLE_MVCC)
rc = mdbx_txn_park(txn, false);
continue;
}
rc = MDBX_ENODATA;
if (bytes_copied == 0)
break;
rc = errno;
if (rc == EXDEV || rc == EINVAL /* workaround for ecryptfs bug(s), maybe useful for other filesystems */)
not_the_same_filesystem = true;
else if (ignore_enosys_and_eagain(rc) == MDBX_RESULT_TRUE)
copyfilerange_unavailable = true;
else
break;
}
#endif /* MDBX_USE_COPYFILERANGE */
/* fallback to portable */
const size_t chunk =
((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) ? (size_t)MDBX_ENVCOPY_WRITEBUF : used_size - offset;
/* copy through an intermediate buffer to avoid EFAULT from write() if the mapped source page gets swapped out */
memcpy(data_buffer, ptr_disp(env->dxb_mmap.base, offset), chunk);
if (flags & MDBX_CP_THROTTLE_MVCC)
mdbx_txn_park(txn, false);
rc = osal_write(fd, data_buffer, chunk);
offset += chunk;
}
/* Extend file if required */
if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
if (!dest_is_pipe)
rc = osal_ftruncate(fd, whole_size);
else {
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) {
const size_t chunk =
((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) ? (size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset;
rc = osal_write(fd, data_buffer, chunk);
offset += chunk;
}
}
}
return rc;
}
//----------------------------------------------------------------------------
__cold static int copy2fd(MDBX_txn *txn, mdbx_filehandle_t fd, MDBX_copy_flags_t flags) {
if (unlikely(txn->flags & MDBX_TXN_DIRTY))
return MDBX_BAD_TXN;
int rc = MDBX_SUCCESS;
if (txn->flags & MDBX_TXN_RDONLY) {
if (flags & MDBX_CP_THROTTLE_MVCC) {
rc = mdbx_txn_park(txn, true);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
} else if (unlikely(flags & (MDBX_CP_THROTTLE_MVCC | MDBX_CP_RENEW_TXN)))
return MDBX_EINVAL;
const int dest_is_pipe = osal_is_pipe(fd);
if (MDBX_IS_ERROR(dest_is_pipe))
return dest_is_pipe;
if (!dest_is_pipe) {
rc = osal_fseek(fd, 0);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
MDBX_env *const env = txn->env;
const size_t buffer_size =
pgno_align2os_bytes(env, NUM_METAS) +
ceil_powerof2(((flags & MDBX_CP_COMPACT) ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF : (size_t)MDBX_ENVCOPY_WRITEBUF),
globals.sys_pagesize);
uint8_t *buffer = nullptr;
rc = osal_memalign_alloc(globals.sys_pagesize, buffer_size, (void **)&buffer);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (!dest_is_pipe) {
/* First write a stub into the meta-pages, so an incomplete copy can never be mistaken for a valid database. */
memset(buffer, -1, pgno2bytes(env, NUM_METAS));
rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS));
}
if (likely(rc == MDBX_SUCCESS))
rc = mdbx_txn_unpark(txn, false);
if (likely(rc == MDBX_SUCCESS)) {
memset(buffer, 0, pgno2bytes(env, NUM_METAS));
rc = ((flags & MDBX_CP_COMPACT) ? copy_with_compacting : copy_asis)(env, txn, fd, buffer, dest_is_pipe, flags);
if (likely(rc == MDBX_SUCCESS))
rc = mdbx_txn_unpark(txn, false);
}
if (txn->flags & MDBX_TXN_RDONLY) {
if (flags & MDBX_CP_THROTTLE_MVCC)
mdbx_txn_park(txn, true);
else if (flags & MDBX_CP_DISPOSE_TXN)
mdbx_txn_reset(txn);
}
if (!dest_is_pipe) {
if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_DONT_FLUSH) == 0)
rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
/* Write actual meta */
if (likely(rc == MDBX_SUCCESS))
rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);
if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_DONT_FLUSH) == 0)
rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
}
osal_memalign_free(buffer);
return rc;
}
__cold static int copy2pathname(MDBX_txn *txn, const pathchar_t *dest_path, MDBX_copy_flags_t flags) {
if (unlikely(!dest_path || *dest_path == '\0'))
return MDBX_EINVAL;
/* The destination path must exist, but the destination file must not.
* We don't want the OS to cache the writes, since the source data is
* already in the OS cache. */
mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE;
int rc = osal_openfile(MDBX_OPEN_COPY, txn->env, dest_path, &newfd,
#if defined(_WIN32) || defined(_WIN64)
(mdbx_mode_t)-1
#else
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP
#endif
);
#if defined(_WIN32) || defined(_WIN64)
/* no locking required since the file is opened with ShareMode == 0 */
#else
if (rc == MDBX_SUCCESS) {
MDBX_STRUCT_FLOCK lock_op;
memset(&lock_op, 0, sizeof(lock_op));
lock_op.l_type = F_WRLCK;
lock_op.l_whence = SEEK_SET;
lock_op.l_start = 0;
lock_op.l_len = OFF_T_MAX;
if (MDBX_FCNTL(newfd, MDBX_F_SETLK, &lock_op))
rc = errno;
}
#if defined(LOCK_EX) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24)
if (rc == MDBX_SUCCESS && flock(newfd, LOCK_EX | LOCK_NB)) {
const int err_flock = errno, err_fs = osal_check_fs_local(newfd, 0);
if (err_flock != EAGAIN || err_fs != MDBX_EREMOTE) {
ERROR("%s flock(%" MDBX_PRIsPATH ") error %d, remote-fs check status %d", "unexpected", dest_path, err_flock,
err_fs);
rc = err_flock;
} else {
WARNING("%s flock(%" MDBX_PRIsPATH ") error %d, remote-fs check status %d", "ignore", dest_path, err_flock,
err_fs);
}
}
#endif /* LOCK_EX && ANDROID_API >= 24 */
#endif /* Windows / POSIX */
if (rc == MDBX_SUCCESS)
rc = copy2fd(txn, newfd, flags);
if (newfd != INVALID_HANDLE_VALUE) {
int err = osal_closefile(newfd);
if (rc == MDBX_SUCCESS && err != rc)
rc = err;
if (rc != MDBX_SUCCESS)
(void)osal_removefile(dest_path);
}
return rc;
}
//----------------------------------------------------------------------------
__cold int mdbx_txn_copy2fd(MDBX_txn *txn, mdbx_filehandle_t fd, MDBX_copy_flags_t flags) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (likely(rc == MDBX_SUCCESS))
rc = copy2fd(txn, fd, flags);
if (flags & MDBX_CP_DISPOSE_TXN)
mdbx_txn_abort(txn);
return LOG_IFERR(rc);
}
__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, MDBX_copy_flags_t flags) {
if (unlikely(flags & (MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN)))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
MDBX_txn *txn = nullptr;
rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = copy2fd(txn, fd, flags | MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN);
mdbx_txn_abort(txn);
return LOG_IFERR(rc);
}
__cold int mdbx_txn_copy2pathname(MDBX_txn *txn, const char *dest_path, MDBX_copy_flags_t flags) {
#if defined(_WIN32) || defined(_WIN64)
wchar_t *dest_pathW = nullptr;
int rc = osal_mb2w(dest_path, &dest_pathW);
if (likely(rc == MDBX_SUCCESS)) {
rc = mdbx_txn_copy2pathnameW(txn, dest_pathW, flags);
osal_free(dest_pathW);
}
return LOG_IFERR(rc);
}
__cold int mdbx_txn_copy2pathnameW(MDBX_txn *txn, const wchar_t *dest_path, MDBX_copy_flags_t flags) {
#endif /* Windows */
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (likely(rc == MDBX_SUCCESS))
rc = copy2pathname(txn, dest_path, flags);
if (flags & MDBX_CP_DISPOSE_TXN)
mdbx_txn_abort(txn);
return LOG_IFERR(rc);
}
__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) {
#if defined(_WIN32) || defined(_WIN64)
wchar_t *dest_pathW = nullptr;
int rc = osal_mb2w(dest_path, &dest_pathW);
if (likely(rc == MDBX_SUCCESS)) {
rc = mdbx_env_copyW(env, dest_pathW, flags);
osal_free(dest_pathW);
}
return LOG_IFERR(rc);
}
__cold int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, MDBX_copy_flags_t flags) {
#endif /* Windows */
if (unlikely(flags & (MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN)))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
MDBX_txn *txn = nullptr;
rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = copy2pathname(txn, dest_path, flags | MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN);
mdbx_txn_abort(txn);
return LOG_IFERR(rc);
}
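/* Illustrative usage sketch (not part of the library): a compacting backup of
 * a live environment through the public API declared in mdbx.h. The
 * destination path below is hypothetical and error handling is reduced to
 * returning the error code. */
static int example_compacting_backup(MDBX_env *env) {
  /* MDBX_CP_COMPACT omits unused pages and renumbers the rest, while
   * MDBX_CP_FORCE_DYNAMIC_SIZE makes the geometry of the copy resizable. */
  return mdbx_env_copy(env, "/tmp/backup.mdbx", MDBX_CP_COMPACT | MDBX_CP_FORCE_DYNAMIC_SIZE);
}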

View File

@ -1,755 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
MDBX_cursor *mdbx_cursor_create(void *context) {
cursor_couple_t *couple = osal_calloc(1, sizeof(cursor_couple_t));
if (unlikely(!couple))
return nullptr;
VALGRIND_MAKE_MEM_UNDEFINED(couple, sizeof(cursor_couple_t));
couple->outer.signature = cur_signature_ready4dispose;
couple->outer.next = &couple->outer;
couple->userctx = context;
cursor_reset(couple);
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.backup, sizeof(couple->outer.backup));
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.tree, sizeof(couple->outer.tree));
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.clc, sizeof(couple->outer.clc));
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.dbi_state, sizeof(couple->outer.dbi_state));
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.subcur, sizeof(couple->outer.subcur));
VALGRIND_MAKE_MEM_DEFINED(&couple->outer.txn, sizeof(couple->outer.txn));
return &couple->outer;
}
int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) {
return likely(mc) ? mdbx_cursor_bind(txn, mc, (kvx_t *)mc->clc - txn->env->kvs) : LOG_IFERR(MDBX_EINVAL);
}
int mdbx_cursor_reset(MDBX_cursor *mc) {
int rc = cursor_check(mc, MDBX_TXN_FINISHED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_reset((cursor_couple_t *)mc);
return MDBX_SUCCESS;
}
int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
if (unlikely(!mc))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(mc->signature != cur_signature_ready4dispose && mc->signature != cur_signature_live)) {
int rc = (mc->signature == cur_signature_wait4eot) ? MDBX_EINVAL : MDBX_EBADSIGN;
return LOG_IFERR(rc);
}
int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(dbi == FREE_DBI && !(txn->flags & MDBX_TXN_RDONLY)))
return LOG_IFERR(MDBX_EACCESS);
rc = dbi_check(txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(mc->backup)) /* Cursor from parent transaction */
return LOG_IFERR(MDBX_EINVAL);
if (mc->signature == cur_signature_live) {
if (mc->txn == txn && cursor_dbi(mc) == dbi)
return MDBX_SUCCESS;
rc = mdbx_cursor_unbind(mc);
if (unlikely(rc != MDBX_SUCCESS))
return (rc == MDBX_BAD_TXN) ? MDBX_EINVAL : rc;
}
cASSERT(mc, mc->next == mc);
rc = cursor_init(mc, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
mc->next = txn->cursors[dbi];
txn->cursors[dbi] = mc;
txn->flags |= txn_may_have_cursors;
return MDBX_SUCCESS;
}
int mdbx_cursor_unbind(MDBX_cursor *mc) {
if (unlikely(!mc))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(mc->signature != cur_signature_live))
return (mc->signature == cur_signature_ready4dispose) ? MDBX_SUCCESS : LOG_IFERR(MDBX_EBADSIGN);
if (unlikely(mc->backup)) /* Cursor from parent transaction */
/* TODO: implement when cursors are switched to a doubly-linked list */
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn(mc->txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD);
if (unlikely(rc != MDBX_SUCCESS)) {
for (const MDBX_txn *txn = mc->txn; rc == MDBX_BAD_TXN && check_txn(txn, MDBX_TXN_FINISHED) == MDBX_SUCCESS;
txn = txn->nested)
if (dbi_state(txn, cursor_dbi(mc)) == 0)
/* special case: the cursor is attached to the parent transaction, but the corresponding dbi handle has not
 * been used in the nested transaction yet, i.e. the cursor has not been imported into the child transaction
 * and has no saved state associated with it (hence mc->backup is nullptr). */
rc = MDBX_EINVAL;
return LOG_IFERR(rc);
}
if (unlikely(!mc->txn || mc->txn->signature != txn_signature)) {
ERROR("Wrong cursor's transaction %p 0x%x", __Wpedantic_format_voidptr(mc->txn), mc->txn ? mc->txn->signature : 0);
return LOG_IFERR(MDBX_PROBLEM);
}
if (mc->next != mc) {
const size_t dbi = cursor_dbi(mc);
cASSERT(mc, dbi < mc->txn->n_dbi);
cASSERT(mc, &mc->txn->env->kvs[dbi].clc == mc->clc);
if (dbi < mc->txn->n_dbi) {
MDBX_cursor **prev = &mc->txn->cursors[dbi];
while (/* *prev && */ *prev != mc) {
ENSURE(mc->txn->env, (*prev)->signature == cur_signature_live || (*prev)->signature == cur_signature_wait4eot);
prev = &(*prev)->next;
}
cASSERT(mc, *prev == mc);
*prev = mc->next;
}
mc->next = mc;
}
cursor_drown((cursor_couple_t *)mc);
mc->signature = cur_signature_ready4dispose;
return MDBX_SUCCESS;
}
int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) {
if (unlikely(!ret))
return LOG_IFERR(MDBX_EINVAL);
*ret = nullptr;
MDBX_cursor *const mc = mdbx_cursor_create(nullptr);
if (unlikely(!mc))
return LOG_IFERR(MDBX_ENOMEM);
int rc = mdbx_cursor_bind(txn, mc, dbi);
if (unlikely(rc != MDBX_SUCCESS)) {
mdbx_cursor_close(mc);
return LOG_IFERR(rc);
}
*ret = mc;
return MDBX_SUCCESS;
}
void mdbx_cursor_close(MDBX_cursor *cursor) {
if (likely(cursor)) {
int err = mdbx_cursor_close2(cursor);
if (unlikely(err != MDBX_SUCCESS))
mdbx_panic("%s:%d error %d (%s) while closing cursor", __func__, __LINE__, err, mdbx_liberr2str(err));
}
}
int mdbx_cursor_close2(MDBX_cursor *mc) {
if (unlikely(!mc))
return LOG_IFERR(MDBX_EINVAL);
if (mc->signature == cur_signature_ready4dispose) {
if (unlikely(mc->txn || mc->backup))
return LOG_IFERR(MDBX_PANIC);
cursor_drown((cursor_couple_t *)mc);
mc->signature = 0;
osal_free(mc);
return MDBX_SUCCESS;
}
if (unlikely(mc->signature != cur_signature_live))
return LOG_IFERR(MDBX_EBADSIGN);
MDBX_txn *const txn = mc->txn;
int rc = check_txn(txn, MDBX_TXN_FINISHED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (mc->backup) {
/* Cursor closed before nested txn ends */
cursor_reset((cursor_couple_t *)mc);
mc->signature = cur_signature_wait4eot;
return MDBX_SUCCESS;
}
if (mc->next != mc) {
const size_t dbi = cursor_dbi(mc);
cASSERT(mc, dbi < mc->txn->n_dbi);
cASSERT(mc, &mc->txn->env->kvs[dbi].clc == mc->clc);
if (likely(dbi < txn->n_dbi)) {
MDBX_cursor **prev = &txn->cursors[dbi];
while (/* *prev && */ *prev != mc) {
ENSURE(txn->env, (*prev)->signature == cur_signature_live || (*prev)->signature == cur_signature_wait4eot);
prev = &(*prev)->next;
}
tASSERT(txn, *prev == mc);
*prev = mc->next;
}
mc->next = mc;
}
cursor_drown((cursor_couple_t *)mc);
mc->signature = 0;
osal_free(mc);
return MDBX_SUCCESS;
}
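/* Illustrative usage sketch (not part of the library): the typical
 * open/iterate/close cursor life-cycle via the public API; the table handle
 * `dbi` is assumed to be opened by the caller beforehand. */
static int example_iterate_all(MDBX_txn *txn, MDBX_dbi dbi) {
  MDBX_cursor *cursor = nullptr;
  int rc = mdbx_cursor_open(txn, dbi, &cursor);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_val key, data;
  /* MDBX_FIRST positions at the smallest key, MDBX_NEXT walks forward until
   * MDBX_NOTFOUND signals the end of the table. */
  for (rc = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST); rc == MDBX_SUCCESS;
       rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) {
    /* consume key and data here */
  }
  mdbx_cursor_close(cursor);
  return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
}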
int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) {
int rc = cursor_check(src, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = mdbx_cursor_bind(src->txn, dest, cursor_dbi(src));
if (unlikely(rc != MDBX_SUCCESS))
return rc;
assert(dest->tree == src->tree);
assert(cursor_dbi(dest) == cursor_dbi(src));
again:
assert(dest->clc == src->clc);
assert(dest->txn == src->txn);
dest->top_and_flags = src->top_and_flags;
for (intptr_t i = 0; i <= src->top; ++i) {
dest->ki[i] = src->ki[i];
dest->pg[i] = src->pg[i];
}
if (src->subcur) {
dest->subcur->nested_tree = src->subcur->nested_tree;
src = &src->subcur->cursor;
dest = &dest->subcur->cursor;
goto again;
}
return MDBX_SUCCESS;
}
int mdbx_txn_release_all_cursors_ex(const MDBX_txn *txn, bool unbind, size_t *count) {
int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
size_t n = 0;
do {
TXN_FOREACH_DBI_FROM(txn, i, MAIN_DBI) {
MDBX_cursor *mc = txn->cursors[i], *next = nullptr;
if (mc) {
txn->cursors[i] = nullptr;
do {
next = mc->next;
if (mc->signature == cur_signature_live) {
mc->signature = cur_signature_wait4eot;
cursor_drown((cursor_couple_t *)mc);
} else
ENSURE(nullptr, mc->signature == cur_signature_wait4eot);
if (mc->backup) {
MDBX_cursor *bk = mc->backup;
mc->next = bk->next;
mc->backup = bk->backup;
bk->backup = nullptr;
bk->signature = 0;
osal_free(bk);
} else {
mc->signature = cur_signature_ready4dispose;
mc->next = mc;
++n;
if (!unbind) {
mc->signature = 0;
osal_free(mc);
}
}
} while ((mc = next) != nullptr);
}
}
txn = txn->parent;
} while (txn);
if (count)
*count = n;
return MDBX_SUCCESS;
}
int mdbx_cursor_compare(const MDBX_cursor *l, const MDBX_cursor *r, bool ignore_multival) {
const int incomparable = INT16_MAX + 1;
if (unlikely(!l))
return r ? -incomparable * 9 : 0;
else if (unlikely(!r))
return incomparable * 9;
if (unlikely(cursor_check_pure(l) != MDBX_SUCCESS))
return (cursor_check_pure(r) == MDBX_SUCCESS) ? -incomparable * 8 : 0;
if (unlikely(cursor_check_pure(r) != MDBX_SUCCESS))
return (cursor_check_pure(l) == MDBX_SUCCESS) ? incomparable * 8 : 0;
if (unlikely(l->clc != r->clc)) {
if (l->txn->env != r->txn->env)
return (l->txn->env > r->txn->env) ? incomparable * 7 : -incomparable * 7;
if (l->txn->txnid != r->txn->txnid)
return (l->txn->txnid > r->txn->txnid) ? incomparable * 6 : -incomparable * 6;
return (l->clc > r->clc) ? incomparable * 5 : -incomparable * 5;
}
assert(cursor_dbi(l) == cursor_dbi(r));
int diff = is_pointed(l) - is_pointed(r);
if (unlikely(diff))
return (diff > 0) ? incomparable * 4 : -incomparable * 4;
if (unlikely(!is_pointed(l)))
return 0;
intptr_t detent = (l->top <= r->top) ? l->top : r->top;
for (intptr_t i = 0; i <= detent; ++i) {
diff = l->ki[i] - r->ki[i];
if (diff)
return diff;
}
if (unlikely(l->top != r->top))
return (l->top > r->top) ? incomparable * 3 : -incomparable * 3;
assert((l->subcur != nullptr) == (r->subcur != nullptr));
if (unlikely((l->subcur != nullptr) != (r->subcur != nullptr)))
return l->subcur ? incomparable * 2 : -incomparable * 2;
if (ignore_multival || !l->subcur)
return 0;
#if MDBX_DEBUG
if (is_pointed(&l->subcur->cursor)) {
const page_t *mp = l->pg[l->top];
const node_t *node = page_node(mp, l->ki[l->top]);
assert(node_flags(node) & N_DUP);
}
if (is_pointed(&r->subcur->cursor)) {
const page_t *mp = r->pg[r->top];
const node_t *node = page_node(mp, r->ki[r->top]);
assert(node_flags(node) & N_DUP);
}
#endif /* MDBX_DEBUG */
l = &l->subcur->cursor;
r = &r->subcur->cursor;
diff = is_pointed(l) - is_pointed(r);
if (unlikely(diff))
return (diff > 0) ? incomparable * 2 : -incomparable * 2;
if (unlikely(!is_pointed(l)))
return 0;
detent = (l->top <= r->top) ? l->top : r->top;
for (intptr_t i = 0; i <= detent; ++i) {
diff = l->ki[i] - r->ki[i];
if (diff)
return diff;
}
if (unlikely(l->top != r->top))
return (l->top > r->top) ? incomparable : -incomparable;
return (l->flags & z_eof_hard) - (r->flags & z_eof_hard);
}
int mdbx_cursor_count_ex(const MDBX_cursor *mc, size_t *count, MDBX_stat *ns, size_t bytes) {
int rc = cursor_check_ro(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (ns) {
const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
return LOG_IFERR(MDBX_EINVAL);
memset(ns, 0, sizeof(*ns));
}
size_t nvals = 0;
if (is_filled(mc)) {
nvals = 1;
if (!inner_hollow(mc)) {
const page_t *mp = mc->pg[mc->top];
const node_t *node = page_node(mp, mc->ki[mc->top]);
cASSERT(mc, node_flags(node) & N_DUP);
const tree_t *nt = &mc->subcur->nested_tree;
nvals = unlikely(nt->items > PTRDIFF_MAX) ? PTRDIFF_MAX : (size_t)nt->items;
if (ns) {
ns->ms_psize = (unsigned)node_ds(node);
if (node_flags(node) & N_TREE) {
ns->ms_psize = mc->txn->env->ps;
ns->ms_depth = nt->height;
ns->ms_branch_pages = nt->branch_pages;
}
cASSERT(mc, nt->large_pages == 0);
ns->ms_leaf_pages = nt->leaf_pages;
ns->ms_entries = nt->items;
if (likely(bytes >= offsetof(MDBX_stat, ms_mod_txnid) + sizeof(ns->ms_mod_txnid)))
ns->ms_mod_txnid = nt->mod_txnid;
}
}
}
if (likely(count))
*count = nvals;
return MDBX_SUCCESS;
}
int mdbx_cursor_count(const MDBX_cursor *mc, size_t *count) {
if (unlikely(count == nullptr))
return LOG_IFERR(MDBX_EINVAL);
return mdbx_cursor_count_ex(mc, count, nullptr, 0);
}
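/* Illustrative sketch (not part of the library): after positioning on a key of
 * a MDBX_DUPSORT table, mdbx_cursor_count() reports how many values are stored
 * under that key. */
static int example_count_dups(MDBX_cursor *cursor, MDBX_val *key, size_t *values) {
  MDBX_val data;
  int rc = mdbx_cursor_get(cursor, key, &data, MDBX_SET_KEY);
  if (rc != MDBX_SUCCESS)
    return rc;
  return mdbx_cursor_count(cursor, values);
}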
int mdbx_cursor_on_first(const MDBX_cursor *mc) {
int rc = cursor_check_pure(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
for (intptr_t i = 0; i <= mc->top; ++i) {
if (mc->ki[i])
return MDBX_RESULT_FALSE;
}
return MDBX_RESULT_TRUE;
}
int mdbx_cursor_on_first_dup(const MDBX_cursor *mc) {
int rc = cursor_check_pure(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (is_filled(mc) && mc->subcur) {
mc = &mc->subcur->cursor;
for (intptr_t i = 0; i <= mc->top; ++i) {
if (mc->ki[i])
return MDBX_RESULT_FALSE;
}
}
return MDBX_RESULT_TRUE;
}
int mdbx_cursor_on_last(const MDBX_cursor *mc) {
int rc = cursor_check_pure(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
for (intptr_t i = 0; i <= mc->top; ++i) {
size_t nkeys = page_numkeys(mc->pg[i]);
if (mc->ki[i] < nkeys - 1)
return MDBX_RESULT_FALSE;
}
return MDBX_RESULT_TRUE;
}
int mdbx_cursor_on_last_dup(const MDBX_cursor *mc) {
int rc = cursor_check_pure(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (is_filled(mc) && mc->subcur) {
mc = &mc->subcur->cursor;
for (intptr_t i = 0; i <= mc->top; ++i) {
size_t nkeys = page_numkeys(mc->pg[i]);
if (mc->ki[i] < nkeys - 1)
return MDBX_RESULT_FALSE;
}
}
return MDBX_RESULT_TRUE;
}
int mdbx_cursor_eof(const MDBX_cursor *mc) {
int rc = cursor_check_pure(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
return is_eof(mc) ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE;
}
int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) {
int rc = cursor_check_ro(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
return LOG_IFERR(cursor_ops(mc, key, data, op));
}
__hot static int scan_confinue(MDBX_cursor *mc, MDBX_predicate_func *predicate, void *context, void *arg, MDBX_val *key,
MDBX_val *value, MDBX_cursor_op turn_op) {
int rc;
switch (turn_op) {
case MDBX_NEXT:
case MDBX_NEXT_NODUP:
for (;;) {
rc = predicate(context, key, value, arg);
if (rc != MDBX_RESULT_FALSE)
return rc;
rc = outer_next(mc, key, value, turn_op);
if (unlikely(rc != MDBX_SUCCESS))
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
}
case MDBX_PREV:
case MDBX_PREV_NODUP:
for (;;) {
rc = predicate(context, key, value, arg);
if (rc != MDBX_RESULT_FALSE)
return rc;
rc = outer_prev(mc, key, value, turn_op);
if (unlikely(rc != MDBX_SUCCESS))
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
}
case MDBX_NEXT_DUP:
if (mc->subcur)
for (;;) {
rc = predicate(context, key, value, arg);
if (rc != MDBX_RESULT_FALSE)
return rc;
rc = inner_next(&mc->subcur->cursor, value);
if (unlikely(rc != MDBX_SUCCESS))
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
}
return MDBX_NOTFOUND;
case MDBX_PREV_DUP:
if (mc->subcur)
for (;;) {
rc = predicate(context, key, value, arg);
if (rc != MDBX_RESULT_FALSE)
return rc;
rc = inner_prev(&mc->subcur->cursor, value);
if (unlikely(rc != MDBX_SUCCESS))
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
}
return MDBX_NOTFOUND;
default:
for (;;) {
rc = predicate(context, key, value, arg);
if (rc != MDBX_RESULT_FALSE)
return rc;
rc = cursor_ops(mc, key, value, turn_op);
if (unlikely(rc != MDBX_SUCCESS))
return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc;
}
}
}
int mdbx_cursor_scan(MDBX_cursor *mc, MDBX_predicate_func *predicate, void *context, MDBX_cursor_op start_op,
MDBX_cursor_op turn_op, void *arg) {
if (unlikely(!predicate))
return LOG_IFERR(MDBX_EINVAL);
const unsigned valid_start_mask = 1 << MDBX_FIRST | 1 << MDBX_FIRST_DUP | 1 << MDBX_LAST | 1 << MDBX_LAST_DUP |
1 << MDBX_GET_CURRENT | 1 << MDBX_GET_MULTIPLE;
if (unlikely(start_op > 30 || ((1 << start_op) & valid_start_mask) == 0))
return LOG_IFERR(MDBX_EINVAL);
const unsigned valid_turn_mask = 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | 1 << MDBX_PREV |
1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | 1 << MDBX_NEXT_MULTIPLE |
1 << MDBX_PREV_MULTIPLE;
if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0))
return LOG_IFERR(MDBX_EINVAL);
MDBX_val key = {nullptr, 0}, value = {nullptr, 0};
int rc = mdbx_cursor_get(mc, &key, &value, start_op);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
return LOG_IFERR(scan_confinue(mc, predicate, context, arg, &key, &value, turn_op));
}
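/* Illustrative sketch (not part of the library): using mdbx_cursor_scan() to
 * find the first value longer than a threshold. The predicate signature
 * mirrors the way scan_confinue() above invokes it; returning MDBX_RESULT_TRUE
 * stops the scan, MDBX_RESULT_FALSE continues it. */
static int example_longer_than(void *context, MDBX_val *key, MDBX_val *value, void *arg) {
  (void)context;
  (void)key;
  const size_t threshold = *(const size_t *)arg;
  return (value->iov_len > threshold) ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE;
}
static int example_scan(MDBX_cursor *cursor, size_t threshold) {
  return mdbx_cursor_scan(cursor, example_longer_than, nullptr, MDBX_FIRST, MDBX_NEXT, &threshold);
}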
int mdbx_cursor_scan_from(MDBX_cursor *mc, MDBX_predicate_func *predicate, void *context, MDBX_cursor_op from_op,
MDBX_val *key, MDBX_val *value, MDBX_cursor_op turn_op, void *arg) {
if (unlikely(!predicate || !key))
return LOG_IFERR(MDBX_EINVAL);
const unsigned valid_start_mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY |
1 << MDBX_GET_MULTIPLE | 1 << MDBX_SET_LOWERBOUND | 1 << MDBX_SET_UPPERBOUND;
if (unlikely(from_op < MDBX_TO_KEY_LESSER_THAN && ((1 << from_op) & valid_start_mask) == 0))
return LOG_IFERR(MDBX_EINVAL);
const unsigned valid_turn_mask = 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | 1 << MDBX_PREV |
1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | 1 << MDBX_NEXT_MULTIPLE |
1 << MDBX_PREV_MULTIPLE;
if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0))
return LOG_IFERR(MDBX_EINVAL);
int rc = mdbx_cursor_get(mc, key, value, from_op);
if (unlikely(MDBX_IS_ERROR(rc)))
return LOG_IFERR(rc);
cASSERT(mc, key != nullptr);
MDBX_val stub;
if (!value) {
value = &stub;
rc = cursor_ops(mc, key, value, MDBX_GET_CURRENT);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
return LOG_IFERR(scan_confinue(mc, predicate, context, arg, key, value, turn_op));
}
int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, size_t limit, MDBX_cursor_op op) {
if (unlikely(!count))
return LOG_IFERR(MDBX_EINVAL);
*count = 0;
if (unlikely(limit < 4 || limit > INTPTR_MAX - 2))
return LOG_IFERR(MDBX_EINVAL);
int rc = cursor_check_ro(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(mc->subcur))
return LOG_IFERR(MDBX_INCOMPATIBLE) /* must be a non-dupsort table */;
switch (op) {
case MDBX_NEXT:
if (unlikely(is_eof(mc)))
return LOG_IFERR(is_pointed(mc) ? MDBX_NOTFOUND : MDBX_ENODATA);
break;
case MDBX_FIRST:
if (!is_filled(mc)) {
rc = outer_first(mc, nullptr, nullptr);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
break;
default:
DEBUG("unhandled/unimplemented cursor operation %u", op);
return LOG_IFERR(MDBX_EINVAL);
}
const page_t *mp = mc->pg[mc->top];
size_t nkeys = page_numkeys(mp);
size_t ki = mc->ki[mc->top];
size_t n = 0;
while (n + 2 <= limit) {
cASSERT(mc, ki < nkeys);
if (unlikely(ki >= nkeys))
goto sibling;
const node_t *leaf = page_node(mp, ki);
pairs[n] = get_key(leaf);
rc = node_read(mc, leaf, &pairs[n + 1], mp);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
n += 2;
if (++ki == nkeys) {
sibling:
rc = cursor_sibling_right(mc);
if (rc != MDBX_SUCCESS) {
if (rc == MDBX_NOTFOUND)
rc = MDBX_RESULT_TRUE;
goto bailout;
}
mp = mc->pg[mc->top];
DEBUG("next page is %" PRIaPGNO ", key index %u", mp->pgno, mc->ki[mc->top]);
if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) {
ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", mp->pgno, mp->flags);
rc = MDBX_CORRUPTED;
goto bailout;
}
nkeys = page_numkeys(mp);
ki = 0;
}
}
mc->ki[mc->top] = (indx_t)ki;
bailout:
*count = n;
return LOG_IFERR(rc);
}
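/* Illustrative sketch (not part of the library): draining a non-dupsort table
 * in batches. As implemented above, `pairs` is filled with interleaved
 * key[0], data[0], key[1], data[1], ... and MDBX_RESULT_TRUE signals that the
 * last page has been reached. */
static int example_read_in_batches(MDBX_cursor *cursor) {
  MDBX_val pairs[64];
  size_t count;
  int rc = mdbx_cursor_get_batch(cursor, &count, pairs, 64, MDBX_FIRST);
  while (rc == MDBX_SUCCESS || rc == MDBX_RESULT_TRUE) {
    for (size_t i = 0; i + 1 < count; i += 2) {
      /* pairs[i] is the key, pairs[i + 1] is the corresponding value */
    }
    if (rc == MDBX_RESULT_TRUE)
      return MDBX_SUCCESS;
    rc = mdbx_cursor_get_batch(cursor, &count, pairs, 64, MDBX_NEXT);
  }
  return rc;
}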
/*----------------------------------------------------------------------------*/
int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) {
int rc = cursor_check(mc, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t *couple = container_of(mc, cursor_couple_t, outer);
couple->userctx = ctx;
return MDBX_SUCCESS;
}
void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) {
if (unlikely(!mc))
return nullptr;
if (unlikely(mc->signature != cur_signature_ready4dispose && mc->signature != cur_signature_live))
return nullptr;
cursor_couple_t *couple = container_of(mc, cursor_couple_t, outer);
return couple->userctx;
}
MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) {
if (unlikely(!mc || mc->signature != cur_signature_live))
return nullptr;
MDBX_txn *txn = mc->txn;
if (unlikely(!txn || txn->signature != txn_signature || (txn->flags & MDBX_TXN_FINISHED)))
return nullptr;
return (txn->flags & MDBX_TXN_HAS_CHILD) ? txn->env->txn : txn;
}
MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) {
if (unlikely(!mc || mc->signature != cur_signature_live))
return UINT_MAX;
return cursor_dbi(mc);
}
/*----------------------------------------------------------------------------*/
int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, MDBX_put_flags_t flags) {
if (unlikely(key == nullptr || data == nullptr))
return LOG_IFERR(MDBX_EINVAL);
int rc = cursor_check_rw(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(flags & MDBX_MULTIPLE)) {
rc = cursor_check_multiple(mc, key, data, flags);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
if (flags & MDBX_RESERVE) {
if (unlikely(mc->tree->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_INTEGERDUP | MDBX_DUPFIXED)))
return LOG_IFERR(MDBX_INCOMPATIBLE);
data->iov_base = nullptr;
}
return LOG_IFERR(cursor_put_checklen(mc, key, data, flags));
}
int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
int rc = cursor_check_rw(mc);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
return LOG_IFERR(cursor_del(mc, flags));
}
__cold int mdbx_cursor_ignord(MDBX_cursor *mc) {
int rc = cursor_check(mc, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
mc->checking |= z_ignord;
if (mc->subcur)
mc->subcur->cursor.checking |= z_ignord;
return MDBX_SUCCESS;
}

View File

@ -1,315 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi) {
return LOG_IFERR(dbi_open(txn, name, flags, dbi, nullptr, nullptr));
}
int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
MDBX_cmp_func *datacmp) {
return LOG_IFERR(dbi_open(txn, name, flags, dbi, keycmp, datacmp));
}
static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, MDBX_db_flags_t flags, MDBX_dbi *dbi,
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
MDBX_val thunk, *name;
if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || name_cstr == MDBX_CHK_META)
name = (void *)name_cstr;
else {
thunk.iov_len = strlen(name_cstr);
thunk.iov_base = (void *)name_cstr;
name = &thunk;
}
return dbi_open(txn, name, flags, dbi, keycmp, datacmp);
}
int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi) {
return LOG_IFERR(dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr));
}
int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
MDBX_cmp_func *datacmp) {
return LOG_IFERR(dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp));
}
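/* Illustrative sketch (not part of the library): opening (and creating on
 * demand) a named table within a write transaction; the table name below is
 * hypothetical. */
static int example_open_table(MDBX_txn *txn, MDBX_dbi *dbi) {
  /* MDBX_CREATE makes the table if it does not exist yet; adding MDBX_DUPSORT
   * would additionally allow multiple values per key. */
  return mdbx_dbi_open(txn, "my-table", MDBX_CREATE, dbi);
}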
__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) {
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (txn->dbs[dbi].height) {
cx.outer.next = txn->cursors[dbi];
txn->cursors[dbi] = &cx.outer;
rc = tree_drop(&cx.outer, dbi == MAIN_DBI || (cx.outer.tree->flags & MDBX_DUPSORT));
txn->cursors[dbi] = cx.outer.next;
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
/* Invalidate the dropped DB's cursors */
for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next)
be_poor(mc);
if (!del || dbi < CORE_DBS) {
/* reset the DB record, mark it dirty */
txn->dbi_state[dbi] |= DBI_DIRTY;
txn->dbs[dbi].height = 0;
txn->dbs[dbi].branch_pages = 0;
txn->dbs[dbi].leaf_pages = 0;
txn->dbs[dbi].large_pages = 0;
txn->dbs[dbi].items = 0;
txn->dbs[dbi].root = P_INVALID;
txn->dbs[dbi].sequence = 0;
/* txn->dbs[dbi].mod_txnid = txn->txnid; */
txn->flags |= MDBX_TXN_DIRTY;
return MDBX_SUCCESS;
}
MDBX_env *const env = txn->env;
MDBX_val name = env->kvs[dbi].name;
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
if (likely(rc == MDBX_SUCCESS)) {
rc = cursor_seek(&cx.outer, &name, nullptr, MDBX_SET).err;
if (likely(rc == MDBX_SUCCESS)) {
cx.outer.next = txn->cursors[MAIN_DBI];
txn->cursors[MAIN_DBI] = &cx.outer;
rc = cursor_del(&cx.outer, N_TREE);
txn->cursors[MAIN_DBI] = cx.outer.next;
if (likely(rc == MDBX_SUCCESS)) {
tASSERT(txn, txn->dbi_state[MAIN_DBI] & DBI_DIRTY);
tASSERT(txn, txn->flags & MDBX_TXN_DIRTY);
txn->dbi_state[dbi] = DBI_LINDO | DBI_OLDEN;
rc = osal_fastmutex_acquire(&env->dbi_lock);
if (likely(rc == MDBX_SUCCESS))
return LOG_IFERR(dbi_close_release(env, dbi));
}
}
}
txn->flags |= MDBX_TXN_ERROR;
return LOG_IFERR(rc);
}
__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) {
MDBX_val thunk, *name;
if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || name_cstr == MDBX_CHK_META)
name = (void *)name_cstr;
else {
thunk.iov_len = strlen(name_cstr);
thunk.iov_base = (void *)name_cstr;
name = &thunk;
}
return mdbx_dbi_rename2(txn, dbi, name);
}
__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *new_name) {
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(new_name == MDBX_CHK_MAIN || new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC ||
new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META || new_name->iov_base == MDBX_CHK_META))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(dbi < CORE_DBS))
return LOG_IFERR(MDBX_EINVAL);
rc = dbi_check(txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = osal_fastmutex_acquire(&txn->env->dbi_lock);
if (likely(rc == MDBX_SUCCESS)) {
struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name);
if (pair.defer)
pair.defer->next = nullptr;
dbi_defer_release(txn->env, pair.defer);
rc = pair.err;
}
return LOG_IFERR(rc);
}
int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(dbi < CORE_DBS))
return (dbi == MAIN_DBI) ? MDBX_SUCCESS : LOG_IFERR(MDBX_BAD_DBI);
if (unlikely(dbi >= env->max_dbi))
return LOG_IFERR(MDBX_BAD_DBI);
rc = osal_fastmutex_acquire(&env->dbi_lock);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(dbi >= env->n_dbi)) {
rc = MDBX_BAD_DBI;
bailout:
osal_fastmutex_release(&env->dbi_lock);
return LOG_IFERR(rc);
}
while (env->basal_txn && (env->dbs_flags[dbi] & DB_VALID) && (env->basal_txn->flags & MDBX_TXN_FINISHED) == 0) {
/* LY: Dangerous code, since env->txn may be changed by another thread.
 * Unfortunately there is no reliable solution here, and a crash is possible
 * if the API is misused (calling mdbx_dbi_close concurrently with the
 * completion of a write transaction).
 *
 * To minimize the probability of a crash, the dbi flags are checked first
 * in basal_txn and only then in env->txn. Thus a crash is possible only on
 * a collision with the completion of a nested transaction.
 *
 * Alternatively, one could try to update/put the record in mainDb for the
 * table of the handle being closed. Semantically this is the correct path,
 * but the problem lies in the current API, in which a dbi handle has
 * historically lived and been closed outside of any transaction. Moreover,
 * the problem is not only that there is no pointer to the current write
 * transaction, but that the user certainly does not expect that closing a
 * handle will trigger hidden/opaque activity inside a transaction that may
 * be executing in another thread. In other words, the problem can only
 * arise with incorrect API usage, and if the user allows that, they will
 * certainly not expect hidden actions inside a transaction, so this path is
 * potentially even more dangerous. */
const MDBX_txn *const hazard = env->txn;
osal_compiler_barrier();
if ((dbi_state(env->basal_txn, dbi) & (DBI_LINDO | DBI_DIRTY | DBI_CREAT)) > DBI_LINDO) {
rc = MDBX_DANGLING_DBI;
goto bailout;
}
osal_memory_barrier();
if (unlikely(hazard != env->txn))
continue;
if (hazard != env->basal_txn && hazard && (hazard->flags & MDBX_TXN_FINISHED) == 0 &&
hazard->signature == txn_signature &&
(dbi_state(hazard, dbi) & (DBI_LINDO | DBI_DIRTY | DBI_CREAT)) > DBI_LINDO) {
rc = MDBX_DANGLING_DBI;
goto bailout;
}
osal_compiler_barrier();
if (likely(hazard == env->txn))
break;
}
rc = dbi_close_release(env, dbi);
return LOG_IFERR(rc);
}
int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, unsigned *state) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR - MDBX_TXN_PARKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = dbi_check(txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!flags || !state))
return LOG_IFERR(MDBX_EINVAL);
*flags = txn->dbs[dbi].flags & DB_PERSISTENT_FLAGS;
*state = txn->dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE);
return MDBX_SUCCESS;
}
static void stat_get(const tree_t *db, MDBX_stat *st, size_t bytes) {
st->ms_depth = db->height;
st->ms_branch_pages = db->branch_pages;
st->ms_leaf_pages = db->leaf_pages;
st->ms_overflow_pages = db->large_pages;
st->ms_entries = db->items;
if (likely(bytes >= offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid)))
st->ms_mod_txnid = db->mod_txnid;
}
__cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = dbi_check(txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(txn->flags & MDBX_TXN_BLOCKED))
return LOG_IFERR(MDBX_BAD_TXN);
if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
rc = tbl_fetch((MDBX_txn *)txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
if (unlikely(!dest))
return LOG_IFERR(MDBX_EINVAL);
const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
return LOG_IFERR(MDBX_EINVAL);
dest->ms_psize = txn->env->ps;
stat_get(&txn->dbs[dbi], dest, bytes);
return MDBX_SUCCESS;
}
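/* Illustrative sketch (not part of the library): fetching per-table
 * statistics; passing sizeof(MDBX_stat) requests the full structure including
 * ms_mod_txnid, as handled above. */
static int example_table_entries(const MDBX_txn *txn, MDBX_dbi dbi, uint64_t *entries) {
  MDBX_stat stat;
  int rc = mdbx_dbi_stat(txn, dbi, &stat, sizeof(stat));
  if (rc == MDBX_SUCCESS)
    *entries = stat.ms_entries;
  return rc;
}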
__cold int mdbx_enumerate_tables(const MDBX_txn *txn, MDBX_table_enum_func *func, void *ctx) {
if (unlikely(!func))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cx.outer.next = txn->cursors[MAIN_DBI];
txn->cursors[MAIN_DBI] = &cx.outer;
for (rc = outer_first(&cx.outer, nullptr, nullptr); rc == MDBX_SUCCESS;
rc = outer_next(&cx.outer, nullptr, nullptr, MDBX_NEXT_NODUP)) {
node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
if (node_flags(node) != N_TREE)
continue;
if (unlikely(node_ds(node) != sizeof(tree_t))) {
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid dupsort sub-tree node size",
(unsigned)node_ds(node));
rc = MDBX_CORRUPTED;
break;
}
tree_t reside;
const tree_t *tree = memcpy(&reside, node_data(node), sizeof(reside));
const MDBX_val name = {node_key(node), node_ks(node)};
const MDBX_env *const env = txn->env;
MDBX_dbi dbi = 0;
for (size_t i = CORE_DBS; i < env->n_dbi; ++i) {
if (i >= txn->n_dbi || !(env->dbs_flags[i] & DB_VALID))
continue;
if (env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[i].name))
continue;
tree = dbi_dig(txn, i, &reside);
dbi = (MDBX_dbi)i;
break;
}
MDBX_stat stat;
stat_get(tree, &stat, sizeof(stat));
rc = func(ctx, txn, &name, tree->flags, &stat, dbi);
if (rc != MDBX_SUCCESS)
goto bailout;
}
rc = (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
bailout:
txn->cursors[MAIN_DBI] = cx.outer.next;
return LOG_IFERR(rc);
}

File diff suppressed because it is too large

View File

@ -1,165 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
/*------------------------------------------------------------------------------
* Readers API */
__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, void *ctx) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!func))
return LOG_IFERR(MDBX_EINVAL);
rc = MDBX_RESULT_TRUE;
int serial = 0;
lck_t *const lck = env->lck_mmap.lck;
if (likely(lck)) {
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
for (size_t i = 0; i < snap_nreaders; i++) {
const reader_slot_t *r = lck->rdt + i;
retry_reader:;
const uint32_t pid = atomic_load32(&r->pid, mo_AcquireRelease);
if (!pid)
continue;
txnid_t txnid = safe64_read(&r->txnid);
const uint64_t tid = atomic_load64(&r->tid, mo_Relaxed);
const pgno_t pages_used = atomic_load32(&r->snapshot_pages_used, mo_Relaxed);
const uint64_t reader_pages_retired = atomic_load64(&r->snapshot_pages_retired, mo_Relaxed);
if (unlikely(txnid != safe64_read(&r->txnid) || pid != atomic_load32(&r->pid, mo_AcquireRelease) ||
tid != atomic_load64(&r->tid, mo_Relaxed) ||
pages_used != atomic_load32(&r->snapshot_pages_used, mo_Relaxed) ||
reader_pages_retired != atomic_load64(&r->snapshot_pages_retired, mo_Relaxed)))
goto retry_reader;
eASSERT(env, txnid > 0);
if (txnid >= SAFE64_INVALID_THRESHOLD)
txnid = 0;
size_t bytes_used = 0;
size_t bytes_retained = 0;
uint64_t lag = 0;
if (txnid) {
troika_t troika = meta_tap(env);
retry_header:;
const meta_ptr_t head = meta_recent(env, &troika);
const uint64_t head_pages_retired = unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired);
if (unlikely(meta_should_retry(env, &troika) ||
head_pages_retired != unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired)))
goto retry_header;
lag = (head.txnid - txnid) / xMDBX_TXNID_STEP;
bytes_used = pgno2bytes(env, pages_used);
bytes_retained = (head_pages_retired > reader_pages_retired)
? pgno2bytes(env, (pgno_t)(head_pages_retired - reader_pages_retired))
: 0;
}
rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)((intptr_t)tid), txnid, lag, bytes_used, bytes_retained);
if (unlikely(rc != MDBX_SUCCESS))
break;
}
}
return LOG_IFERR(rc);
}
__cold int mdbx_reader_check(MDBX_env *env, int *dead) {
if (dead)
*dead = 0;
return LOG_IFERR(mvcc_cleanup_dead(env, false, dead));
}
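/* Illustrative sketch (not part of the library): reclaiming reader-table slots
 * left behind by crashed processes; `dead` receives the number of slots that
 * were cleared. */
static int example_cleanup_stale_readers(MDBX_env *env, int *dead) {
  return mdbx_reader_check(env, dead);
}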
__cold int mdbx_thread_register(const MDBX_env *env) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!env->lck_mmap.lck))
return LOG_IFERR((env->flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM);
if (unlikely((env->flags & ENV_TXKEY) == 0)) {
eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
return LOG_IFERR(MDBX_EINVAL) /* MDBX_NOSTICKYTHREADS mode */;
}
eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
reader_slot_t *r = thread_rthc_get(env->me_txkey);
if (unlikely(r != nullptr)) {
eASSERT(env, r->pid.weak == env->pid);
eASSERT(env, r->tid.weak == osal_thread_self());
if (unlikely(r->pid.weak != env->pid))
return LOG_IFERR(MDBX_BAD_RSLOT);
return MDBX_RESULT_TRUE /* already registered */;
}
return LOG_IFERR(mvcc_bind_slot((MDBX_env *)env).err);
}
__cold int mdbx_thread_unregister(const MDBX_env *env) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!env->lck_mmap.lck))
return MDBX_RESULT_TRUE;
if (unlikely((env->flags & ENV_TXKEY) == 0)) {
eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */;
}
eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
reader_slot_t *r = thread_rthc_get(env->me_txkey);
if (unlikely(r == nullptr))
return MDBX_RESULT_TRUE /* not registered */;
eASSERT(env, r->pid.weak == env->pid);
if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self()))
return LOG_IFERR(MDBX_BAD_RSLOT);
eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD))
return LOG_IFERR(MDBX_BUSY) /* transaction is still active */;
atomic_store32(&r->pid, 0, mo_Relaxed);
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
thread_rthc_set(env->me_txkey, nullptr);
return MDBX_SUCCESS;
}
/*------------------------------------------------------------------------------
* Locking API */
int mdbx_txn_lock(MDBX_env *env, bool dont_wait) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(env->flags & MDBX_RDONLY))
return LOG_IFERR(MDBX_EACCESS);
if (dont_wait && unlikely(env->basal_txn->owner || (env->basal_txn->flags & MDBX_TXN_FINISHED) == 0))
return LOG_IFERR(MDBX_BUSY);
return LOG_IFERR(lck_txn_lock(env, dont_wait));
}
int mdbx_txn_unlock(MDBX_env *env) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(env->flags & MDBX_RDONLY))
return LOG_IFERR(MDBX_EACCESS);
#if MDBX_TXN_CHECKOWNER
if (unlikely(env->basal_txn->owner != osal_thread_self()))
return LOG_IFERR(MDBX_THREAD_MISMATCH);
#endif /* MDBX_TXN_CHECKOWNER */
if (unlikely((env->basal_txn->flags & MDBX_TXN_FINISHED) == 0))
return LOG_IFERR(MDBX_BUSY);
lck_txn_unlock(env);
return MDBX_SUCCESS;
}

View File

@ -1,197 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
static inline double key2double(const int64_t key) {
union {
uint64_t u;
double f;
} casting;
casting.u = (key < 0) ? key + UINT64_C(0x8000000000000000) : UINT64_C(0xffffFFFFffffFFFF) - key;
return casting.f;
}
static inline uint64_t double2key(const double *const ptr) {
STATIC_ASSERT(sizeof(double) == sizeof(int64_t));
const int64_t i = *(const int64_t *)ptr;
const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i : i + UINT64_C(0x8000000000000000);
if (ASSERT_ENABLED()) {
const double f = key2double(u);
assert(memcmp(&f, ptr, sizeof(double)) == 0);
}
return u;
}
static inline float key2float(const int32_t key) {
union {
uint32_t u;
float f;
} casting;
casting.u = (key < 0) ? key + UINT32_C(0x80000000) : UINT32_C(0xffffFFFF) - key;
return casting.f;
}
static inline uint32_t float2key(const float *const ptr) {
STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
const int32_t i = *(const int32_t *)ptr;
const uint32_t u = (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000);
if (ASSERT_ENABLED()) {
const float f = key2float(u);
assert(memcmp(&f, ptr, sizeof(float)) == 0);
}
return u;
}
uint64_t mdbx_key_from_double(const double ieee754_64bit) { return double2key(&ieee754_64bit); }
uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) { return double2key(ieee754_64bit); }
uint32_t mdbx_key_from_float(const float ieee754_32bit) { return float2key(&ieee754_32bit); }
uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { return float2key(ieee754_32bit); }
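/* Illustrative sketch (not part of the library): mdbx_key_from_double() maps a
 * double onto an order-preserving 64-bit ordinal that can be used directly as
 * a fixed-size key (typically in a table opened with MDBX_INTEGERKEY, which is
 * an assumption about intended usage rather than something shown above). */
static void example_double_key(double value, uint64_t *ordinal, MDBX_val *key) {
  *ordinal = mdbx_key_from_double(value);
  key->iov_base = ordinal;
  key->iov_len = sizeof(*ordinal);
}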
#define IEEE754_DOUBLE_MANTISSA_SIZE 52
#define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF
#define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF
#define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000)
#define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF)
#define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF)
static inline int clz64(uint64_t value) {
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl)
if (sizeof(value) == sizeof(int))
return __builtin_clz(value);
if (sizeof(value) == sizeof(long))
return __builtin_clzl(value);
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || __has_builtin(__builtin_clzll)
return __builtin_clzll(value);
#endif /* have(long long) && long long == uint64_t */
#endif /* GNU C */
#if defined(_MSC_VER)
unsigned long index;
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
_BitScanReverse64(&index, value);
return 63 - index;
#else
if (value > UINT32_MAX) {
_BitScanReverse(&index, (uint32_t)(value >> 32));
return 31 - index;
}
_BitScanReverse(&index, (uint32_t)value);
return 63 - index;
#endif
#endif /* MSVC */
value |= value >> 1;
value |= value >> 2;
value |= value >> 4;
value |= value >> 8;
value |= value >> 16;
value |= value >> 32;
static const uint8_t debruijn_clz64[64] = {63, 16, 62, 7, 15, 36, 61, 3, 6, 14, 22, 26, 35, 47, 60, 2,
9, 5, 28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1,
17, 8, 37, 4, 23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18,
38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0};
return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58];
}
static inline uint64_t round_mantissa(const uint64_t u64, int shift) {
assert(shift < 0 && u64 > 0);
shift = -shift;
const unsigned half = 1 << (shift - 1);
const unsigned lsb = 1 & (unsigned)(u64 >> shift);
const unsigned tie2even = 1 ^ lsb;
return (u64 + half - tie2even) >> shift;
}
uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) {
const uint64_t bias = UINT64_C(0x8000000000000000);
if (json_integer > 0) {
const uint64_t u64 = json_integer;
int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
uint64_t mantissa = u64 << shift;
if (unlikely(shift < 0)) {
mantissa = round_mantissa(u64, shift);
if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
mantissa = round_mantissa(u64, --shift);
}
assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift;
assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
#if !defined(_MSC_VER) || defined(_DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
symbol __except1 referenced in function __ftol3_except */
assert(key == mdbx_key_from_double((double)json_integer));
#endif /* Workaround for MSVC */
return key;
}
if (json_integer < 0) {
const uint64_t u64 = -json_integer;
int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
uint64_t mantissa = u64 << shift;
if (unlikely(shift < 0)) {
mantissa = round_mantissa(u64, shift);
if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
mantissa = round_mantissa(u64, --shift);
}
assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift;
assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
const uint64_t key =
bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
#if !defined(_MSC_VER) || defined(_DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \
symbol __except1 referenced in function __ftol3_except */
assert(key == mdbx_key_from_double((double)json_integer));
#endif /* Workaround for MSVC */
return key;
}
return bias;
}
int64_t mdbx_jsonInteger_from_key(const MDBX_val v) {
assert(v.iov_len == 8);
const uint64_t key = unaligned_peek_u64(2, v.iov_base);
const uint64_t bias = UINT64_C(0x8000000000000000);
const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1;
const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 -
(IEEE754_DOUBLE_EXPONENTA_MAX & (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE));
if (unlikely(shift < 1))
return (key < bias) ? INT64_MIN : INT64_MAX;
if (unlikely(shift > 63))
return 0;
const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK) << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) + bias;
const int64_t absolute = unscaled >> shift;
const int64_t value = (key < bias) ? -absolute : absolute;
assert(key == mdbx_key_from_jsonInteger(value) ||
(mdbx_key_from_jsonInteger(value - 1) < key && key < mdbx_key_from_jsonInteger(value + 1)));
return value;
}
double mdbx_double_from_key(const MDBX_val v) {
assert(v.iov_len == 8);
return key2double(unaligned_peek_u64(2, v.iov_base));
}
float mdbx_float_from_key(const MDBX_val v) {
assert(v.iov_len == 4);
return key2float(unaligned_peek_u32(2, v.iov_base));
}
int32_t mdbx_int32_from_key(const MDBX_val v) {
assert(v.iov_len == 4);
return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000));
}
int64_t mdbx_int64_from_key(const MDBX_val v) {
assert(v.iov_len == 8);
return (int64_t)(unaligned_peek_u64(2, v.iov_base) - UINT64_C(0x8000000000000000));
}
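/* Illustrative usage sketch (not part of the original source): shows how the
 * key-conversion helpers above are typically used from application code to
 * build 8-byte ordered keys. Assumes only the public <mdbx.h> API; the
 * function name demo_json_integer_key() and its locals are hypothetical. */
static void demo_json_integer_key(void) {
  /* Convert a signed JSON-like integer into an unsigned 8-byte key whose
   * 64-bit unsigned ordering matches the numeric order of the corresponding
   * double value. */
  const int64_t value = -42;
  uint64_t key64 = mdbx_key_from_jsonInteger(value);
  /* Wrap the 8-byte key into an MDBX_val; such keys are meant for tables
   * opened with MDBX_INTEGERKEY, where keys are exactly 8 bytes long. */
  const MDBX_val key = {.iov_base = &key64, .iov_len = sizeof(key64)};
  /* The reverse conversion restores the original integer exactly, as long
   * as the value fits into the 53-bit mantissa of a double. */
  const int64_t roundtrip = mdbx_jsonInteger_from_key(key);
  assert(roundtrip == value);
  (void)roundtrip;
}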

View File

@ -1,286 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
__cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) {
if (volume <= 1024 * 1024 * 4ul)
return MDBX_RESULT_TRUE;
intptr_t pagesize, total_ram_pages;
int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
const int log2page = log2n_powerof2(pagesize);
const intptr_t volume_pages = (volume + pagesize - 1) >> log2page;
const intptr_t redundancy_pages = (redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page)
: (intptr_t)(redundancy + pagesize - 1) >> log2page;
if (volume_pages >= total_ram_pages || volume_pages + redundancy_pages >= total_ram_pages)
return MDBX_RESULT_FALSE;
intptr_t avail_ram_pages;
err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
return (volume_pages + redundancy_pages >= avail_ram_pages) ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE;
}
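/* Illustrative usage sketch (not part of the original source): an application
 * may consult mdbx_is_readahead_reasonable() before opening an environment to
 * decide whether to pass MDBX_NORDAHEAD. Assumes only the public <mdbx.h> API;
 * the 256 MiB redundancy and the function name are arbitrary choices. */
static MDBX_env_flags_t demo_choose_readahead(size_t expected_db_bytes) {
  /* Ask whether the database plus ~256 MiB of slack would still fit in RAM. */
  const int answer = mdbx_is_readahead_reasonable(expected_db_bytes, (intptr_t)256 << 20);
  /* MDBX_RESULT_FALSE means the data is unlikely to fit, so disabling the OS
   * readahead is usually the better choice; any error code is treated here
   * simply as "no preference". */
  return (answer == MDBX_RESULT_FALSE) ? MDBX_NORDAHEAD : MDBX_ENV_DEFAULTS;
}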
int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, uint64_t increment) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = dbi_check(txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
rc = tbl_fetch(txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
tree_t *dbs = &txn->dbs[dbi];
if (likely(result))
*result = dbs->sequence;
if (likely(increment > 0)) {
if (unlikely(dbi == FREE_DBI || (txn->flags & MDBX_TXN_RDONLY) != 0))
return MDBX_EACCESS;
uint64_t new = dbs->sequence + increment;
if (unlikely(new < increment))
return MDBX_RESULT_TRUE;
tASSERT(txn, new > dbs->sequence);
if ((txn->dbi_state[dbi] & DBI_DIRTY) == 0) {
txn->flags |= MDBX_TXN_DIRTY;
txn->dbi_state[dbi] |= DBI_DIRTY;
if (unlikely(dbi == MAIN_DBI) && txn->dbs[MAIN_DBI].root != P_INVALID) {
/* LY: A temporary workaround for coherency_check(), which eventually
 * should be replaced along with the rework of how mod_txnid is set.
 *
 * The essence of the problem:
 * - coherency_check() uses, as one of its "coherency" criteria, the
 *   condition meta.maindb.mod_txnid == maindb.root->txnid;
 * - when maindb.sequence is updated, DBI_DIRTY is raised, which leads
 *   to updating meta.maindb.mod_txnid = current_txnid;
 * - however, if no changes were made to the maindb tree itself and it
 *   is not empty, the root page keeps its previous txnid, and because
 *   of this coherency_check() falsely triggers.
 *
 * The temporary (current) solution: forcibly update the root page in
 * the situation described above. This eliminates the problem without
 * creating risks of regression.
 *
 * FIXME: The final solution, still to be implemented:
 * - change the semantics of setting/updating mod_txnid, binding it
 *   strictly to changes of the b-tree, but not of attributes;
 * - update mod_txnid when committing nested transactions;
 * - for dbi-handles of user tables, DBI_DIRTY can (apparently) be kept
 *   as a sign that the table record in MainDB needs updating, while
 *   raising DBI_DIRTY together with updating mod_txnid, including when
 *   updating sequence.
 * - for MAIN_DBI, when updating sequence, DBI_DIRTY should not be
 *   raised and/or mod_txnid updated; only MDBX_TXN_DIRTY should be set.
 * - alternatively, the dbi_state flag bits could be redistributed to
 *   distinguish the dirty-tree and dirty-attributes states. */
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = tree_search(&cx.outer, nullptr, Z_MODIFY | Z_ROOTONLY);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
}
dbs->sequence = new;
}
return MDBX_SUCCESS;
}
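/* Illustrative usage sketch (not part of the original source): allocating a
 * monotonically increasing record id via mdbx_dbi_sequence() inside a write
 * transaction. Assumes only the public <mdbx.h> API; the function name and
 * the choice of error code for overflow are hypothetical. */
static int demo_next_id(MDBX_txn *rw_txn, MDBX_dbi dbi, uint64_t *id) {
  /* Reads the current sequence value into *id and advances it by one within
   * the transaction; the change becomes durable only on commit. */
  int rc = mdbx_dbi_sequence(rw_txn, dbi, id, 1);
  /* MDBX_RESULT_TRUE signals that the increment would overflow the 64-bit
   * counter, in which case the sequence is left unchanged. */
  return (rc == MDBX_RESULT_TRUE) ? MDBX_EINVAL : rc;
}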
int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) {
eASSERT(nullptr, txn->signature == txn_signature);
tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi));
tASSERT(txn, dbi < txn->env->n_dbi && (txn->env->dbs_flags[dbi] & DB_VALID) != 0);
return txn->env->kvs[dbi].clc.k.cmp(a, b);
}
int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) {
eASSERT(nullptr, txn->signature == txn_signature);
tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi));
tASSERT(txn, dbi < txn->env->n_dbi && (txn->env->dbs_flags[dbi] & DB_VALID));
return txn->env->kvs[dbi].clc.v.cmp(a, b);
}
__cold MDBX_cmp_func *mdbx_get_keycmp(MDBX_db_flags_t flags) { return builtin_keycmp(flags); }
__cold MDBX_cmp_func *mdbx_get_datacmp(MDBX_db_flags_t flags) { return builtin_datacmp(flags); }
/*----------------------------------------------------------------------------*/
__cold const char *mdbx_liberr2str(int errnum) {
/* Table of descriptions for MDBX errors */
static const char *const tbl[] = {
"MDBX_KEYEXIST: Key/data pair already exists",
"MDBX_NOTFOUND: No matching key/data pair found",
"MDBX_PAGE_NOTFOUND: Requested page not found",
"MDBX_CORRUPTED: Database is corrupted",
"MDBX_PANIC: Environment had fatal error",
"MDBX_VERSION_MISMATCH: DB version mismatch libmdbx",
"MDBX_INVALID: File is not an MDBX file",
"MDBX_MAP_FULL: Environment mapsize limit reached",
"MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)",
"MDBX_READERS_FULL: Too many readers (maxreaders reached)",
nullptr /* MDBX_TLS_FULL (-30789): unused in MDBX */,
"MDBX_TXN_FULL: Transaction has too many dirty pages,"
" i.e transaction is too big",
"MDBX_CURSOR_FULL: Cursor stack limit reachedn - this usually indicates"
" corruption, i.e branch-pages loop",
"MDBX_PAGE_FULL: Internal error - Page has no more space",
"MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend"
" mapping, e.g. since address space is unavailable or busy,"
" or Operation system not supported such operations",
"MDBX_INCOMPATIBLE: Environment or database is not compatible"
" with the requested operation or the specified flags",
"MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot,"
" e.g. read-transaction already run for current thread",
"MDBX_BAD_TXN: Transaction is not valid for requested operation,"
" e.g. had errored and be must aborted, has a child, or is invalid",
"MDBX_BAD_VALSIZE: Invalid size or alignment of key or data"
" for target database, either invalid table name",
"MDBX_BAD_DBI: The specified DBI-handle is invalid"
" or changed by another thread/transaction",
"MDBX_PROBLEM: Unexpected internal error, transaction should be aborted",
"MDBX_BUSY: Another write transaction is running,"
" or environment is already used while opening with MDBX_EXCLUSIVE flag",
};
if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) {
int i = errnum - MDBX_KEYEXIST;
return tbl[i];
}
switch (errnum) {
case MDBX_SUCCESS:
return "MDBX_SUCCESS: Successful";
case MDBX_EMULTIVAL:
return "MDBX_EMULTIVAL: The specified key has"
" more than one associated value";
case MDBX_EBADSIGN:
return "MDBX_EBADSIGN: Wrong signature of a runtime object(s),"
" e.g. memory corruption or double-free";
case MDBX_WANNA_RECOVERY:
return "MDBX_WANNA_RECOVERY: Database should be recovered,"
" but this could NOT be done automatically for now"
" since it opened in read-only mode";
case MDBX_EKEYMISMATCH:
return "MDBX_EKEYMISMATCH: The given key value is mismatched to the"
" current cursor position";
case MDBX_TOO_LARGE:
return "MDBX_TOO_LARGE: Database is too large for current system,"
" e.g. could NOT be mapped into RAM";
case MDBX_THREAD_MISMATCH:
return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not"
" owned object, e.g. a transaction that started by another thread";
case MDBX_TXN_OVERLAPPING:
return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for"
" the current thread";
case MDBX_DUPLICATED_CLK:
return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists,"
" please keep one and remove unused other";
case MDBX_DANGLING_DBI:
return "MDBX_DANGLING_DBI: Some cursors and/or other resources should be"
" closed before table or corresponding DBI-handle could be (re)used";
case MDBX_OUSTED:
return "MDBX_OUSTED: The parked read transaction was outed for the sake"
" of recycling old MVCC snapshots";
case MDBX_MVCC_RETARDED:
return "MDBX_MVCC_RETARDED: MVCC snapshot used by parked transaction was bygone";
default:
return nullptr;
}
}
__cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) {
const char *msg = mdbx_liberr2str(errnum);
if (!msg && buflen > 0 && buflen < INT_MAX) {
#if defined(_WIN32) || defined(_WIN64)
DWORD size = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, errnum,
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, nullptr);
while (size && buf[size - 1] <= ' ')
--size;
buf[size] = 0;
return size ? buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
#elif defined(_GNU_SOURCE) && defined(__GLIBC__)
/* GNU-specific */
if (errnum > 0)
msg = strerror_r(errnum, buf, buflen);
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600)
/* XSI-compliant */
if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0)
msg = buf;
#else
if (errnum > 0) {
msg = strerror(errnum);
if (msg) {
strncpy(buf, msg, buflen);
msg = buf;
}
}
#endif
if (!msg) {
(void)snprintf(buf, buflen, "error %d", errnum);
msg = buf;
}
buf[buflen - 1] = '\0';
}
return msg;
}
__cold const char *mdbx_strerror(int errnum) {
#if defined(_WIN32) || defined(_WIN64)
static char buf[1024];
return mdbx_strerror_r(errnum, buf, sizeof(buf));
#else
const char *msg = mdbx_liberr2str(errnum);
if (!msg) {
if (errnum > 0)
msg = strerror(errnum);
if (!msg) {
static char buf[32];
(void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum);
msg = buf;
}
}
return msg;
#endif
}
#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */
const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) {
const char *msg = mdbx_liberr2str(errnum);
if (!msg && buflen > 0 && buflen < INT_MAX) {
DWORD size = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, errnum,
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, nullptr);
while (size && buf[size - 1] <= ' ')
--size;
buf[size] = 0;
if (!size)
msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
else if (!CharToOemBuffA(buf, buf, size))
msg = "CharToOemBuffA() failed";
else
msg = buf;
}
return msg;
}
const char *mdbx_strerror_ANSI2OEM(int errnum) {
static char buf[1024];
return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf));
}
#endif /* Bit of madness for Windows */
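/* Illustrative usage sketch (not part of the original source): the usual
 * error-reporting pattern around libmdbx calls, preferring mdbx_strerror_r()
 * with a caller-provided buffer when thread-safety matters. Assumes only the
 * public <mdbx.h> API plus stdio; the function name is hypothetical. */
static void demo_report_error(const char *op, int rc) {
  if (rc != MDBX_SUCCESS) {
    char buf[256];
    /* Unlike mdbx_strerror(), this variant never relies on static storage,
     * so it is safe to call concurrently from several threads. */
    fprintf(stderr, "%s failed: %s (%d)\n", op, mdbx_strerror_r(rc, buf, sizeof(buf)), rc);
  }
}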

View File

@ -1,578 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
static pgno_t env_max_pgno(const MDBX_env *env) {
return env->ps ? bytes2pgno(env, env->geo_in_bytes.upper ? env->geo_in_bytes.upper : MAX_MAPSIZE) : PAGELIST_LIMIT;
}
__cold pgno_t default_dp_limit(const MDBX_env *env) {
/* auto-setup dp_limit by "The42" ;-) */
intptr_t total_ram_pages, avail_ram_pages;
int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages);
pgno_t dp_limit = 1024;
if (unlikely(err != MDBX_SUCCESS))
ERROR("mdbx_get_sysraminfo(), rc %d", err);
else {
size_t estimate = (size_t)(total_ram_pages + avail_ram_pages) / 42;
if (env->ps) {
if (env->ps > globals.sys_pagesize)
estimate /= env->ps / globals.sys_pagesize;
else if (env->ps < globals.sys_pagesize)
estimate *= globals.sys_pagesize / env->ps;
}
dp_limit = (pgno_t)estimate;
}
dp_limit = (dp_limit < PAGELIST_LIMIT) ? dp_limit : PAGELIST_LIMIT;
const pgno_t max_pgno = env_max_pgno(env);
if (dp_limit > max_pgno - NUM_METAS)
dp_limit = max_pgno - NUM_METAS;
dp_limit = (dp_limit > CURSOR_STACK_SIZE * 4) ? dp_limit : CURSOR_STACK_SIZE * 4;
return dp_limit;
}
__cold static pgno_t default_rp_augment_limit(const MDBX_env *env) {
const size_t timeframe = /* 16 seconds */ 16 << 16;
const size_t remain_1sec =
(env->options.gc_time_limit < timeframe) ? timeframe - (size_t)env->options.gc_time_limit : 0;
const size_t minimum = (env->maxgc_large1page * 2 > MDBX_PNL_INITIAL) ? env->maxgc_large1page * 2 : MDBX_PNL_INITIAL;
const size_t one_third = env->geo_in_bytes.now / 3 >> env->ps2ln;
const size_t augment_limit =
(one_third > minimum) ? minimum + (one_third - minimum) / timeframe * remain_1sec : minimum;
eASSERT(env, augment_limit < PAGELIST_LIMIT);
return pnl_bytes2size(pnl_size2bytes(augment_limit));
}
static bool default_prefault_write(const MDBX_env *env) {
return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->incore &&
(env->flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP;
}
static bool default_prefer_waf_insteadof_balance(const MDBX_env *env) {
(void)env;
return false;
}
static uint16_t default_subpage_limit(const MDBX_env *env) {
(void)env;
return 65535 /* 100% */;
}
static uint16_t default_subpage_room_threshold(const MDBX_env *env) {
(void)env;
return 0 /* 0% */;
}
static uint16_t default_subpage_reserve_prereq(const MDBX_env *env) {
(void)env;
return 27525 /* 42% */;
}
static uint16_t default_subpage_reserve_limit(const MDBX_env *env) {
(void)env;
return 2753 /* 4.2% */;
}
static uint16_t default_merge_threshold_16dot16_percent(const MDBX_env *env) {
(void)env;
return 65536 / 4 /* 25% */;
}
static pgno_t default_dp_reserve_limit(const MDBX_env *env) {
(void)env;
return MDBX_PNL_INITIAL;
}
static pgno_t default_dp_initial(const MDBX_env *env) {
(void)env;
return MDBX_PNL_INITIAL;
}
static uint8_t default_spill_max_denominator(const MDBX_env *env) {
(void)env;
return 8;
}
static uint8_t default_spill_min_denominator(const MDBX_env *env) {
(void)env;
return 8;
}
static uint8_t default_spill_parent4child_denominator(const MDBX_env *env) {
(void)env;
return 0;
}
static uint8_t default_dp_loose_limit(const MDBX_env *env) {
(void)env;
return 64;
}
void env_options_init(MDBX_env *env) {
env->options.rp_augment_limit = default_rp_augment_limit(env);
env->options.dp_reserve_limit = default_dp_reserve_limit(env);
env->options.dp_initial = default_dp_initial(env);
env->options.dp_limit = default_dp_limit(env);
env->options.spill_max_denominator = default_spill_max_denominator(env);
env->options.spill_min_denominator = default_spill_min_denominator(env);
env->options.spill_parent4child_denominator = default_spill_parent4child_denominator(env);
env->options.dp_loose_limit = default_dp_loose_limit(env);
env->options.merge_threshold_16dot16_percent = default_merge_threshold_16dot16_percent(env);
if (default_prefer_waf_insteadof_balance(env))
env->options.prefer_waf_insteadof_balance = true;
#if !(defined(_WIN32) || defined(_WIN64))
env->options.writethrough_threshold =
#if defined(__linux__) || defined(__gnu_linux__)
globals.running_on_WSL1 ? MAX_PAGENO :
#endif /* Linux */
MDBX_WRITETHROUGH_THRESHOLD_DEFAULT;
#endif /* Windows */
env->options.subpage.limit = default_subpage_limit(env);
env->options.subpage.room_threshold = default_subpage_room_threshold(env);
env->options.subpage.reserve_prereq = default_subpage_reserve_prereq(env);
env->options.subpage.reserve_limit = default_subpage_reserve_limit(env);
}
void env_options_adjust_dp_limit(MDBX_env *env) {
if (!env->options.flags.non_auto.dp_limit)
env->options.dp_limit = default_dp_limit(env);
else {
const pgno_t max_pgno = env_max_pgno(env);
if (env->options.dp_limit > max_pgno - NUM_METAS)
env->options.dp_limit = max_pgno - NUM_METAS;
if (env->options.dp_limit < CURSOR_STACK_SIZE * 4)
env->options.dp_limit = CURSOR_STACK_SIZE * 4;
}
#ifdef MDBX_DEBUG_DPL_LIMIT
env->options.dp_limit = MDBX_DEBUG_DPL_LIMIT;
#endif /* MDBX_DEBUG_DPL_LIMIT */
if (env->options.dp_initial > env->options.dp_limit && env->options.dp_initial > default_dp_initial(env))
env->options.dp_initial = env->options.dp_limit;
env->options.need_dp_limit_adjust = false;
}
void env_options_adjust_defaults(MDBX_env *env) {
if (!env->options.flags.non_auto.rp_augment_limit)
env->options.rp_augment_limit = default_rp_augment_limit(env);
if (!env->options.flags.non_auto.prefault_write)
env->options.prefault_write = default_prefault_write(env);
env->options.need_dp_limit_adjust = true;
if (!env->txn)
env_options_adjust_dp_limit(env);
const size_t basis = env->geo_in_bytes.now;
/* TODO: use options? */
const unsigned factor = 9;
size_t threshold = (basis < ((size_t)65536 << factor)) ? 65536 /* minimal threshold */
: (basis > (MEGABYTE * 4 << factor)) ? MEGABYTE * 4 /* maximal threshold */
: basis >> factor;
threshold =
(threshold < env->geo_in_bytes.shrink || !env->geo_in_bytes.shrink) ? threshold : env->geo_in_bytes.shrink;
env->madv_threshold = bytes2pgno(env, bytes_align2os_bytes(env, threshold));
}
//------------------------------------------------------------------------------
__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, uint64_t value) {
int err = check_env(env, false);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
const bool lock_needed = ((env->flags & ENV_ACTIVE) && env->basal_txn && !env_owned_wrtxn(env));
bool should_unlock = false;
switch (option) {
case MDBX_opt_sync_bytes:
if (value == /* default */ UINT64_MAX)
value = MAX_WRITE;
if (unlikely(env->flags & MDBX_RDONLY))
return LOG_IFERR(MDBX_EACCESS);
if (unlikely(!(env->flags & ENV_ACTIVE)))
return LOG_IFERR(MDBX_EPERM);
if (unlikely(value > SIZE_MAX - 65536))
return LOG_IFERR(MDBX_EINVAL);
value = bytes2pgno(env, (size_t)value + env->ps - 1);
if ((uint32_t)value != atomic_load32(&env->lck->autosync_threshold, mo_AcquireRelease) &&
atomic_store32(&env->lck->autosync_threshold, (uint32_t)value, mo_Relaxed)
/* Trigger sync(force=off) only if a new non-zero value has been set
 * and we are outside a transaction */
&& lock_needed) {
err = env_sync(env, false, false);
if (err == /* nothing to flush to disk */ MDBX_RESULT_TRUE)
err = MDBX_SUCCESS;
}
break;
case MDBX_opt_sync_period:
if (value == /* default */ UINT64_MAX)
value = 2780315 /* 42.42424 seconds */;
if (unlikely(env->flags & MDBX_RDONLY))
return LOG_IFERR(MDBX_EACCESS);
if (unlikely(!(env->flags & ENV_ACTIVE)))
return LOG_IFERR(MDBX_EPERM);
if (unlikely(value > UINT32_MAX))
return LOG_IFERR(MDBX_EINVAL);
value = osal_16dot16_to_monotime((uint32_t)value);
if (value != atomic_load64(&env->lck->autosync_period, mo_AcquireRelease) &&
atomic_store64(&env->lck->autosync_period, value, mo_Relaxed)
/* Trigger sync(force=off) only if a new non-zero value has been set
 * and we are outside a transaction */
&& lock_needed) {
err = env_sync(env, false, false);
if (err == /* nothing to flush to disk */ MDBX_RESULT_TRUE)
err = MDBX_SUCCESS;
}
break;
case MDBX_opt_max_db:
if (value == /* default */ UINT64_MAX)
value = 42;
if (unlikely(value > MDBX_MAX_DBI))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(env->dxb_mmap.base))
return LOG_IFERR(MDBX_EPERM);
env->max_dbi = (unsigned)value + CORE_DBS;
break;
case MDBX_opt_max_readers:
if (value == /* default */ UINT64_MAX)
value = MDBX_READERS_LIMIT;
if (unlikely(value < 1 || value > MDBX_READERS_LIMIT))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(env->dxb_mmap.base))
return LOG_IFERR(MDBX_EPERM);
env->max_readers = (unsigned)value;
break;
case MDBX_opt_dp_reserve_limit:
if (value == /* default */ UINT64_MAX)
value = default_dp_reserve_limit(env);
if (unlikely(value > INT_MAX))
return LOG_IFERR(MDBX_EINVAL);
if (env->options.dp_reserve_limit != (unsigned)value) {
if (lock_needed) {
err = lck_txn_lock(env, false);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
should_unlock = true;
}
env->options.dp_reserve_limit = (unsigned)value;
while (env->shadow_reserve_len > env->options.dp_reserve_limit) {
eASSERT(env, env->shadow_reserve != nullptr);
page_t *dp = env->shadow_reserve;
MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->ps);
VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *));
env->shadow_reserve = page_next(dp);
void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
osal_free(ptr);
env->shadow_reserve_len -= 1;
}
}
break;
case MDBX_opt_rp_augment_limit:
if (value == /* default */ UINT64_MAX) {
env->options.flags.non_auto.rp_augment_limit = 0;
env->options.rp_augment_limit = default_rp_augment_limit(env);
} else if (unlikely(value > PAGELIST_LIMIT))
return LOG_IFERR(MDBX_EINVAL);
else {
env->options.flags.non_auto.rp_augment_limit = 1;
env->options.rp_augment_limit = (unsigned)value;
}
break;
case MDBX_opt_gc_time_limit:
if (value == /* default */ UINT64_MAX)
value = 0;
if (unlikely(value > UINT32_MAX))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(env->flags & MDBX_RDONLY))
return LOG_IFERR(MDBX_EACCESS);
value = osal_16dot16_to_monotime((uint32_t)value);
if (value != env->options.gc_time_limit) {
if (env->txn && lock_needed)
return LOG_IFERR(MDBX_EPERM);
env->options.gc_time_limit = value;
if (!env->options.flags.non_auto.rp_augment_limit)
env->options.rp_augment_limit = default_rp_augment_limit(env);
}
break;
case MDBX_opt_txn_dp_limit:
case MDBX_opt_txn_dp_initial:
if (value != /* default */ UINT64_MAX && unlikely(value > PAGELIST_LIMIT || value < CURSOR_STACK_SIZE * 4))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(env->flags & MDBX_RDONLY))
return LOG_IFERR(MDBX_EACCESS);
if (lock_needed) {
err = lck_txn_lock(env, false);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
should_unlock = true;
}
if (env->txn)
err = MDBX_EPERM /* unable to change during a transaction */;
else {
const pgno_t max_pgno = env_max_pgno(env);
if (option == MDBX_opt_txn_dp_initial) {
if (value == /* default */ UINT64_MAX)
env->options.dp_initial = default_dp_initial(env);
else {
env->options.dp_initial = (pgno_t)value;
if (env->options.dp_initial > max_pgno)
env->options.dp_initial = (max_pgno > CURSOR_STACK_SIZE * 4) ? max_pgno : CURSOR_STACK_SIZE * 4;
}
}
if (option == MDBX_opt_txn_dp_limit) {
if (value == /* default */ UINT64_MAX) {
env->options.flags.non_auto.dp_limit = 0;
} else {
env->options.flags.non_auto.dp_limit = 1;
env->options.dp_limit = (pgno_t)value;
}
env_options_adjust_dp_limit(env);
}
}
break;
case MDBX_opt_spill_max_denominator:
if (value == /* default */ UINT64_MAX)
value = default_spill_max_denominator(env);
if (unlikely(value > 255))
return LOG_IFERR(MDBX_EINVAL);
env->options.spill_max_denominator = (uint8_t)value;
break;
case MDBX_opt_spill_min_denominator:
if (value == /* default */ UINT64_MAX)
value = default_spill_min_denominator(env);
if (unlikely(value > 255))
return LOG_IFERR(MDBX_EINVAL);
env->options.spill_min_denominator = (uint8_t)value;
break;
case MDBX_opt_spill_parent4child_denominator:
if (value == /* default */ UINT64_MAX)
value = default_spill_parent4child_denominator(env);
if (unlikely(value > 255))
return LOG_IFERR(MDBX_EINVAL);
env->options.spill_parent4child_denominator = (uint8_t)value;
break;
case MDBX_opt_loose_limit:
if (value == /* default */ UINT64_MAX)
value = default_dp_loose_limit(env);
if (unlikely(value > 255))
return LOG_IFERR(MDBX_EINVAL);
env->options.dp_loose_limit = (uint8_t)value;
break;
case MDBX_opt_merge_threshold_16dot16_percent:
if (value == /* default */ UINT64_MAX)
value = default_merge_threshold_16dot16_percent(env);
if (unlikely(value < 8192 || value > 32768))
return LOG_IFERR(MDBX_EINVAL);
env->options.merge_threshold_16dot16_percent = (unsigned)value;
recalculate_merge_thresholds(env);
break;
case MDBX_opt_writethrough_threshold:
#if defined(_WIN32) || defined(_WIN64)
/* allow "setting" only the default value or the value matching the behavior
 * implied by the current MDBX_NOMETASYNC setting */
if (value != /* default */ UINT64_MAX && value != ((env->flags & MDBX_NOMETASYNC) ? 0 : UINT_MAX))
err = MDBX_EINVAL;
#else
if (value == /* default */ UINT64_MAX)
value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT;
if (value != (unsigned)value)
err = MDBX_EINVAL;
else
env->options.writethrough_threshold = (unsigned)value;
#endif
break;
case MDBX_opt_prefault_write_enable:
if (value == /* default */ UINT64_MAX) {
env->options.prefault_write = default_prefault_write(env);
env->options.flags.non_auto.prefault_write = false;
} else if (value > 1)
err = MDBX_EINVAL;
else {
env->options.prefault_write = value != 0;
env->options.flags.non_auto.prefault_write = true;
}
break;
case MDBX_opt_prefer_waf_insteadof_balance:
if (value == /* default */ UINT64_MAX)
env->options.prefer_waf_insteadof_balance = default_prefer_waf_insteadof_balance(env);
else if (value > 1)
err = MDBX_EINVAL;
else
env->options.prefer_waf_insteadof_balance = value != 0;
break;
case MDBX_opt_subpage_limit:
if (value == /* default */ UINT64_MAX) {
env->options.subpage.limit = default_subpage_limit(env);
recalculate_subpage_thresholds(env);
} else if (value > 65535)
err = MDBX_EINVAL;
else {
env->options.subpage.limit = (uint16_t)value;
recalculate_subpage_thresholds(env);
}
break;
case MDBX_opt_subpage_room_threshold:
if (value == /* default */ UINT64_MAX) {
env->options.subpage.room_threshold = default_subpage_room_threshold(env);
recalculate_subpage_thresholds(env);
} else if (value > 65535)
err = MDBX_EINVAL;
else {
env->options.subpage.room_threshold = (uint16_t)value;
recalculate_subpage_thresholds(env);
}
break;
case MDBX_opt_subpage_reserve_prereq:
if (value == /* default */ UINT64_MAX) {
env->options.subpage.reserve_prereq = default_subpage_reserve_prereq(env);
recalculate_subpage_thresholds(env);
} else if (value > 65535)
err = MDBX_EINVAL;
else {
env->options.subpage.reserve_prereq = (uint16_t)value;
recalculate_subpage_thresholds(env);
}
break;
case MDBX_opt_subpage_reserve_limit:
if (value == /* default */ UINT64_MAX) {
env->options.subpage.reserve_limit = default_subpage_reserve_limit(env);
recalculate_subpage_thresholds(env);
} else if (value > 65535)
err = MDBX_EINVAL;
else {
env->options.subpage.reserve_limit = (uint16_t)value;
recalculate_subpage_thresholds(env);
}
break;
default:
return LOG_IFERR(MDBX_EINVAL);
}
if (should_unlock)
lck_txn_unlock(env);
return LOG_IFERR(err);
}
__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, uint64_t *pvalue) {
int err = check_env(env, false);
if (unlikely(err != MDBX_SUCCESS))
return LOG_IFERR(err);
if (unlikely(!pvalue))
return LOG_IFERR(MDBX_EINVAL);
switch (option) {
case MDBX_opt_sync_bytes:
if (unlikely(!(env->flags & ENV_ACTIVE)))
return LOG_IFERR(MDBX_EPERM);
*pvalue = pgno2bytes(env, atomic_load32(&env->lck->autosync_threshold, mo_Relaxed));
break;
case MDBX_opt_sync_period:
if (unlikely(!(env->flags & ENV_ACTIVE)))
return LOG_IFERR(MDBX_EPERM);
*pvalue = osal_monotime_to_16dot16(atomic_load64(&env->lck->autosync_period, mo_Relaxed));
break;
case MDBX_opt_max_db:
*pvalue = env->max_dbi - CORE_DBS;
break;
case MDBX_opt_max_readers:
*pvalue = env->max_readers;
break;
case MDBX_opt_dp_reserve_limit:
*pvalue = env->options.dp_reserve_limit;
break;
case MDBX_opt_rp_augment_limit:
*pvalue = env->options.rp_augment_limit;
break;
case MDBX_opt_gc_time_limit:
*pvalue = osal_monotime_to_16dot16(env->options.gc_time_limit);
break;
case MDBX_opt_txn_dp_limit:
*pvalue = env->options.dp_limit;
break;
case MDBX_opt_txn_dp_initial:
*pvalue = env->options.dp_initial;
break;
case MDBX_opt_spill_max_denominator:
*pvalue = env->options.spill_max_denominator;
break;
case MDBX_opt_spill_min_denominator:
*pvalue = env->options.spill_min_denominator;
break;
case MDBX_opt_spill_parent4child_denominator:
*pvalue = env->options.spill_parent4child_denominator;
break;
case MDBX_opt_loose_limit:
*pvalue = env->options.dp_loose_limit;
break;
case MDBX_opt_merge_threshold_16dot16_percent:
*pvalue = env->options.merge_threshold_16dot16_percent;
break;
case MDBX_opt_writethrough_threshold:
#if defined(_WIN32) || defined(_WIN64)
*pvalue = (env->flags & MDBX_NOMETASYNC) ? 0 : INT_MAX;
#else
*pvalue = env->options.writethrough_threshold;
#endif
break;
case MDBX_opt_prefault_write_enable:
*pvalue = env->options.prefault_write;
break;
case MDBX_opt_prefer_waf_insteadof_balance:
*pvalue = env->options.prefer_waf_insteadof_balance;
break;
case MDBX_opt_subpage_limit:
*pvalue = env->options.subpage.limit;
break;
case MDBX_opt_subpage_room_threshold:
*pvalue = env->options.subpage.room_threshold;
break;
case MDBX_opt_subpage_reserve_prereq:
*pvalue = env->options.subpage.reserve_prereq;
break;
case MDBX_opt_subpage_reserve_limit:
*pvalue = env->options.subpage.reserve_limit;
break;
default:
return LOG_IFERR(MDBX_EINVAL);
}
return MDBX_SUCCESS;
}
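/* Illustrative usage sketch (not part of the original source): tuning a couple
 * of options around environment setup. The option names are the MDBX_option_t
 * values handled above; the particular numbers and the function name are
 * arbitrary choices for the example. */
static int demo_tune_env(MDBX_env *env) {
  /* Must be done before mdbx_env_open(): reserve room for up to 16 named tables. */
  int rc = mdbx_env_set_option(env, MDBX_opt_max_db, 16);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* Passing UINT64_MAX restores an option to its default/auto behavior. */
  rc = mdbx_env_set_option(env, MDBX_opt_txn_dp_limit, UINT64_MAX);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* Read back the effective (auto-computed) dirty-pages limit. */
  uint64_t dp_limit = 0;
  return mdbx_env_get_option(env, MDBX_opt_txn_dp_limit, &dp_limit);
}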

View File

@ -1,365 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
typedef struct diff_result {
ptrdiff_t diff;
intptr_t level;
ptrdiff_t root_nkeys;
} diff_t;
/* calculates: r = x - y */
__hot static int cursor_diff(const MDBX_cursor *const __restrict x, const MDBX_cursor *const __restrict y,
diff_t *const __restrict r) {
r->diff = 0;
r->level = 0;
r->root_nkeys = 0;
int rc = check_txn(x->txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (unlikely(x->txn != y->txn))
return MDBX_BAD_TXN;
if (unlikely(y->dbi_state != x->dbi_state))
return MDBX_EINVAL;
const intptr_t depth = (x->top < y->top) ? x->top : y->top;
if (unlikely(depth < 0))
return MDBX_ENODATA;
r->root_nkeys = page_numkeys(x->pg[0]);
intptr_t nkeys = r->root_nkeys;
for (;;) {
if (unlikely(y->pg[r->level] != x->pg[r->level])) {
ERROR("Mismatch cursors's pages at %zu level", r->level);
return MDBX_PROBLEM;
}
r->diff = x->ki[r->level] - y->ki[r->level];
if (r->diff)
break;
r->level += 1;
if (r->level > depth) {
r->diff = CMP2INT(x->flags & z_eof_hard, y->flags & z_eof_hard);
return MDBX_SUCCESS;
}
nkeys = page_numkeys(x->pg[r->level]);
}
while (unlikely(r->diff == 1) && likely(r->level < depth)) {
r->level += 1;
/* DB'PAGEs: 0------------------>MAX
*
* CURSORs: y < x
* STACK[i ]: |
* STACK[+1]: ...y++N|0++x...
*/
nkeys = page_numkeys(y->pg[r->level]);
r->diff = (nkeys - y->ki[r->level]) + x->ki[r->level];
assert(r->diff > 0);
}
while (unlikely(r->diff == -1) && likely(r->level < depth)) {
r->level += 1;
/* DB'PAGEs: 0------------------>MAX
*
* CURSORs: x < y
* STACK[i ]: |
* STACK[+1]: ...x--N|0--y...
*/
nkeys = page_numkeys(x->pg[r->level]);
r->diff = -(nkeys - x->ki[r->level]) - y->ki[r->level];
assert(r->diff < 0);
}
return MDBX_SUCCESS;
}
__hot static ptrdiff_t estimate(const tree_t *tree, diff_t *const __restrict dr) {
/* root: branch-page => scale = leaf-factor * branch-factor^(N-1)
* level-1: branch-page(s) => scale = leaf-factor * branch-factor^2
* level-2: branch-page(s) => scale = leaf-factor * branch-factor
* level-N: branch-page(s) => scale = leaf-factor
* leaf-level: leaf-page(s) => scale = 1
*/
ptrdiff_t btree_power = (ptrdiff_t)tree->height - 2 - (ptrdiff_t)dr->level;
if (btree_power < 0)
return dr->diff;
ptrdiff_t estimated = (ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)tree->leaf_pages;
if (btree_power == 0)
return estimated;
if (tree->height < 4) {
assert(dr->level == 0 && btree_power == 1);
return (ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)dr->root_nkeys;
}
/* average_branchpage_fillfactor = total(branch_entries) / branch_pages
total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */
const size_t log2_fixedpoint = sizeof(size_t) - 1;
const size_t half = UINT64_C(1) << (log2_fixedpoint - 1);
const size_t factor = ((tree->leaf_pages + tree->branch_pages - 1) << log2_fixedpoint) / tree->branch_pages;
while (1) {
switch ((size_t)btree_power) {
default: {
const size_t square = (factor * factor + half) >> log2_fixedpoint;
const size_t quad = (square * square + half) >> log2_fixedpoint;
do {
estimated = estimated * quad + half;
estimated >>= log2_fixedpoint;
btree_power -= 4;
} while (btree_power >= 4);
continue;
}
case 3:
estimated = estimated * factor + half;
estimated >>= log2_fixedpoint;
__fallthrough /* fall through */;
case 2:
estimated = estimated * factor + half;
estimated >>= log2_fixedpoint;
__fallthrough /* fall through */;
case 1:
estimated = estimated * factor + half;
estimated >>= log2_fixedpoint;
__fallthrough /* fall through */;
case 0:
if (unlikely(estimated > (ptrdiff_t)tree->items))
return (ptrdiff_t)tree->items;
if (unlikely(estimated < -(ptrdiff_t)tree->items))
return -(ptrdiff_t)tree->items;
return estimated;
}
}
}
/*------------------------------------------------------------------------------
* Range-Estimation API */
__hot int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last, ptrdiff_t *distance_items) {
if (unlikely(!distance_items))
return LOG_IFERR(MDBX_EINVAL);
int rc = cursor_check_pure(first);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = cursor_check_pure(last);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
*distance_items = 0;
diff_t dr;
rc = cursor_diff(last, first, &dr);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cASSERT(first, dr.diff || inner_pointed(first) == inner_pointed(last));
if (unlikely(dr.diff == 0) && inner_pointed(first)) {
first = &first->subcur->cursor;
last = &last->subcur->cursor;
rc = cursor_diff(first, last, &dr);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
if (likely(dr.diff != 0))
*distance_items = estimate(first->tree, &dr);
return MDBX_SUCCESS;
}
__hot int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op move_op,
ptrdiff_t *distance_items) {
if (unlikely(!distance_items || move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE))
return LOG_IFERR(MDBX_EINVAL);
int rc = cursor_check_ro(cursor);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!is_pointed(cursor)))
return LOG_IFERR(MDBX_ENODATA);
cursor_couple_t next;
rc = cursor_init(&next.outer, cursor->txn, cursor_dbi(cursor));
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_cpstk(cursor, &next.outer);
if (cursor->tree->flags & MDBX_DUPSORT) {
subcur_t *mx = &container_of(cursor, cursor_couple_t, outer)->inner;
cursor_cpstk(&mx->cursor, &next.inner.cursor);
}
MDBX_val stub_data;
if (data == nullptr) {
const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY;
if (unlikely(mask & (1 << move_op)))
return LOG_IFERR(MDBX_EINVAL);
stub_data.iov_base = nullptr;
stub_data.iov_len = 0;
data = &stub_data;
}
MDBX_val stub_key;
if (key == nullptr) {
const unsigned mask =
1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY | 1 << MDBX_SET | 1 << MDBX_SET_RANGE;
if (unlikely(mask & (1 << move_op)))
return LOG_IFERR(MDBX_EINVAL);
stub_key.iov_base = nullptr;
stub_key.iov_len = 0;
key = &stub_key;
}
next.outer.signature = cur_signature_live;
rc = cursor_ops(&next.outer, key, data, move_op);
if (unlikely(rc != MDBX_SUCCESS && (rc != MDBX_NOTFOUND || !is_pointed(&next.outer))))
return LOG_IFERR(rc);
if (move_op == MDBX_LAST) {
next.outer.flags |= z_eof_hard;
next.inner.cursor.flags |= z_eof_hard;
}
return mdbx_estimate_distance(cursor, &next.outer, distance_items);
}
__hot int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *begin_key, const MDBX_val *begin_data,
const MDBX_val *end_key, const MDBX_val *end_data, ptrdiff_t *size_items) {
if (unlikely(!size_items))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(begin_data && (begin_key == nullptr || begin_key == MDBX_EPSILON)))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(end_data && (end_key == nullptr || end_key == MDBX_EPSILON)))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t begin;
/* LY: first, initialize the cursor to refresh the DB in case it has DB_STALE */
rc = cursor_init(&begin.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(begin.outer.tree->items == 0)) {
*size_items = 0;
return MDBX_SUCCESS;
}
if (!begin_key) {
if (unlikely(!end_key)) {
/* LY: FIRST..LAST case */
*size_items = (ptrdiff_t)begin.outer.tree->items;
return MDBX_SUCCESS;
}
rc = outer_first(&begin.outer, nullptr, nullptr);
if (unlikely(end_key == MDBX_EPSILON)) {
/* LY: FIRST..+epsilon case */
return LOG_IFERR((rc == MDBX_SUCCESS) ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) : rc);
}
} else {
if (unlikely(begin_key == MDBX_EPSILON)) {
if (end_key == nullptr) {
/* LY: -epsilon..LAST case */
rc = outer_last(&begin.outer, nullptr, nullptr);
return LOG_IFERR((rc == MDBX_SUCCESS) ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) : rc);
}
/* LY: -epsilon..value case */
assert(end_key != MDBX_EPSILON);
begin_key = end_key;
} else if (unlikely(end_key == MDBX_EPSILON)) {
/* LY: value..+epsilon case */
assert(begin_key != MDBX_EPSILON);
end_key = begin_key;
}
if (end_key && !begin_data && !end_data &&
(begin_key == end_key || begin.outer.clc->k.cmp(begin_key, end_key) == 0)) {
/* LY: single key case */
rc = cursor_seek(&begin.outer, (MDBX_val *)begin_key, nullptr, MDBX_SET).err;
if (unlikely(rc != MDBX_SUCCESS)) {
*size_items = 0;
return LOG_IFERR((rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc);
}
*size_items = 1;
if (inner_pointed(&begin.outer))
*size_items = (sizeof(*size_items) >= sizeof(begin.inner.nested_tree.items) ||
begin.inner.nested_tree.items <= PTRDIFF_MAX)
? (size_t)begin.inner.nested_tree.items
: PTRDIFF_MAX;
return MDBX_SUCCESS;
} else {
MDBX_val proxy_key = *begin_key;
MDBX_val proxy_data = {nullptr, 0};
if (begin_data)
proxy_data = *begin_data;
rc = LOG_IFERR(cursor_seek(&begin.outer, &proxy_key, &proxy_data, MDBX_SET_LOWERBOUND).err);
}
}
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc != MDBX_NOTFOUND || !is_pointed(&begin.outer))
return LOG_IFERR(rc);
}
cursor_couple_t end;
rc = cursor_init(&end.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (!end_key) {
rc = outer_last(&end.outer, nullptr, nullptr);
end.outer.flags |= z_eof_hard;
end.inner.cursor.flags |= z_eof_hard;
} else {
MDBX_val proxy_key = *end_key;
MDBX_val proxy_data = {nullptr, 0};
if (end_data)
proxy_data = *end_data;
rc = cursor_seek(&end.outer, &proxy_key, &proxy_data, MDBX_SET_LOWERBOUND).err;
}
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc != MDBX_NOTFOUND || !is_pointed(&end.outer))
return LOG_IFERR(rc);
}
rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
assert(*size_items >= -(ptrdiff_t)begin.outer.tree->items && *size_items <= (ptrdiff_t)begin.outer.tree->items);
#if 0 /* LY: It was decided to return the estimation results as-is \
       * (i.e. negative) for inverted ranges. */
/* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63
Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */
if (*size_items < 0) {
/* LY: inverted range case */
*size_items += (ptrdiff_t)begin.outer.tree->items;
} else if (*size_items == 0 && begin_key && end_key) {
int cmp = begin.outer.kvx->cmp(&origin_begin_key, &origin_end_key);
if (cmp == 0 && cursor_pointed(begin.inner.cursor.flags) &&
begin_data && end_data)
cmp = begin.outer.kvx->v.cmp(&origin_begin_data, &origin_end_data);
if (cmp > 0) {
/* LY: inverted range case with empty scope */
*size_items = (ptrdiff_t)begin.outer.tree->items;
}
}
assert(*size_items >= 0 &&
*size_items <= (ptrdiff_t)begin.outer.tree->items);
#endif
return MDBX_SUCCESS;
}
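/* Illustrative usage sketch (not part of the original source): estimating how
 * many items fall into the key range [low, high) without scanning it. Assumes
 * only the public <mdbx.h> API; the wrapper name is hypothetical and the
 * result is approximate by design. */
static int demo_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *low, const MDBX_val *high,
                               ptrdiff_t *approx_items) {
  /* nullptr data parts mean "compare by keys only"; a nullptr key would mean
   * "from the first item" / "up to the last item" respectively. */
  return mdbx_estimate_range(txn, dbi, low, nullptr, high, nullptr, approx_items);
}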

View File

@ -1,454 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, uint32_t *mask) {
if (unlikely(!mask))
return LOG_IFERR(MDBX_EINVAL);
*mask = 0;
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if ((cx.outer.tree->flags & MDBX_DUPSORT) == 0)
return MDBX_RESULT_TRUE;
MDBX_val key, data;
rc = outer_first(&cx.outer, &key, &data);
while (rc == MDBX_SUCCESS) {
const node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
const tree_t *db = node_data(node);
const unsigned flags = node_flags(node);
switch (flags) {
case N_BIG:
case 0:
/* single-value entry, depth = 0 */
*mask |= 1 << 0;
break;
case N_DUP:
/* single sub-page, deep = 1 */
*mask |= 1 << 1;
break;
case N_DUP | N_TREE:
/* sub-tree */
*mask |= 1 << UNALIGNED_PEEK_16(db, tree_t, height);
break;
default:
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid node-size", flags);
return LOG_IFERR(MDBX_CORRUPTED);
}
rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP);
}
return LOG_IFERR((rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc);
}
int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) {
if (unlikely(canary == nullptr))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
if (unlikely(rc != MDBX_SUCCESS)) {
memset(canary, 0, sizeof(*canary));
return LOG_IFERR(rc);
}
*canary = txn->canary;
return MDBX_SUCCESS;
}
int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) {
DKBUF_DEBUG;
DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));
if (unlikely(!key || !data))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
return LOG_IFERR(cursor_seek(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err);
}
int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) {
if (unlikely(!key || !data))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
return LOG_IFERR(cursor_ops(&cx.outer, key, data, MDBX_SET_LOWERBOUND));
}
int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, size_t *values_count) {
DKBUF_DEBUG;
DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));
if (unlikely(!key || !data))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = cursor_seek(&cx.outer, key, data, MDBX_SET_KEY).err;
if (unlikely(rc != MDBX_SUCCESS)) {
if (values_count)
*values_count = 0;
return LOG_IFERR(rc);
}
if (values_count) {
*values_count = 1;
if (inner_pointed(&cx.outer))
*values_count =
(sizeof(*values_count) >= sizeof(cx.inner.nested_tree.items) || cx.inner.nested_tree.items <= PTRDIFF_MAX)
? (size_t)cx.inner.nested_tree.items
: PTRDIFF_MAX;
}
return MDBX_SUCCESS;
}
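/* Illustrative usage sketch (not part of the original source): fetching a
 * value and, for MDBX_DUPSORT tables, the number of multi-values stored under
 * the same key. Assumes only the public <mdbx.h> API; the wrapper name and
 * the treatment of MDBX_NOTFOUND are hypothetical. */
static int demo_lookup(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val key, MDBX_val *data, size_t *values) {
  /* mdbx_get_ex() behaves like mdbx_get(), but additionally reports how many
   * values are associated with the key (always 1 for non-DUPSORT tables). */
  int rc = mdbx_get_ex(txn, dbi, &key, data, values);
  /* A missing key is a normal outcome here: *values is set to zero. */
  return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
}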
/*----------------------------------------------------------------------------*/
int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) {
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (likely(canary)) {
if (txn->canary.x == canary->x && txn->canary.y == canary->y && txn->canary.z == canary->z)
return MDBX_SUCCESS;
txn->canary.x = canary->x;
txn->canary.y = canary->y;
txn->canary.z = canary->z;
}
txn->canary.v = txn->txnid;
txn->flags |= MDBX_TXN_DIRTY;
return MDBX_SUCCESS;
}
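/* Illustrative usage sketch (not part of the original source): stamping an
 * application-defined marker into the canary of a write transaction and
 * reading it back later. Assumes only the public <mdbx.h> API; the use of
 * the .x slot and the function name are arbitrary choices. */
static int demo_stamp_canary(MDBX_txn *rw_txn, uint64_t app_marker) {
  MDBX_canary canary;
  int rc = mdbx_canary_get(rw_txn, &canary);
  if (rc != MDBX_SUCCESS)
    return rc;
  canary.x = app_marker; /* x, y and z are free-form application slots */
  /* The .v field is ignored on put: it is always set to the current txnid. */
  return mdbx_canary_put(rw_txn, &canary);
}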
/* The function reports whether the given address resides in a "dirty" page of
 * the specified write transaction. Ultimately this makes it possible to avoid
 * needless copying of data from NON-dirty pages.
 *
 * "Dirty" pages are those that have already been modified during the write
 * transaction. Accordingly, any further changes may overwrite such pages.
 * Therefore, none of the functions performing modifications must receive,
 * as arguments, pointers to data inside such pages. In turn, "NON-dirty"
 * pages will be copied before modification.
 *
 * In other words, data from "dirty" pages must either be copied before being
 * passed as arguments for further modifications, or rejected at the stage of
 * argument validation.
 *
 * Thus, the function allows both avoiding needless copying and performing a
 * more thorough validation of arguments.
 *
 * IMPORTANT: The passed pointer must point to the beginning of the data. Only
 * then is it guaranteed that the actual page header is physically located in
 * the same memory page, including for multi-page P_LARGE pages with long
 * data. */
int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
const MDBX_env *env = txn->env;
const ptrdiff_t offset = ptr_dist(ptr, env->dxb_mmap.base);
if (offset >= 0) {
const pgno_t pgno = bytes2pgno(env, offset);
if (likely(pgno < txn->geo.first_unallocated)) {
const page_t *page = pgno2page(env, pgno);
if (unlikely(page->pgno != pgno || (page->flags & P_ILL_BITS) != 0)) {
/* The ptr points into the middle of a large page,
 * not to the beginning of the data. */
return LOG_IFERR(MDBX_EINVAL);
}
return ((txn->flags & MDBX_TXN_RDONLY) || !is_modifable(txn, page)) ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE;
}
if ((size_t)offset < env->dxb_mmap.limit) {
/* The pointer addresses something inside the mmap, but beyond the
 * boundary of the allocated pages. This may happen if mdbx_is_dirty()
 * is called after an operation during which a dirty page was returned
 * to the unallocated space. */
return (txn->flags & MDBX_TXN_RDONLY) ? LOG_IFERR(MDBX_EINVAL) : MDBX_RESULT_TRUE;
}
}
/* The page is outside the mmap range in use, i.e. either an invalid address
 * was passed to the function, or the address is within a shadow page that
 * was allocated via malloc().
 *
 * For the MDBX_WRITEMAP mode such a page is definitely "not dirty", while
 * for modes without MDBX_WRITEMAP it is definitely "not clean". */
return (txn->flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? LOG_IFERR(MDBX_EINVAL) : MDBX_RESULT_TRUE;
}
int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data) {
if (unlikely(!key))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(dbi <= FREE_DBI))
return LOG_IFERR(MDBX_BAD_DBI);
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
MDBX_val proxy;
MDBX_cursor_op op = MDBX_SET;
unsigned flags = MDBX_ALLDUPS;
if (data) {
proxy = *data;
data = &proxy;
op = MDBX_GET_BOTH;
flags = 0;
}
rc = cursor_seek(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err;
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cx.outer.next = txn->cursors[dbi];
txn->cursors[dbi] = &cx.outer;
rc = cursor_del(&cx.outer, flags);
txn->cursors[dbi] = cx.outer.next;
return LOG_IFERR(rc);
}
int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, MDBX_put_flags_t flags) {
if (unlikely(!key || !data))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(dbi <= FREE_DBI))
return LOG_IFERR(MDBX_BAD_DBI);
if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND |
MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE)))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(flags & MDBX_MULTIPLE)) {
rc = cursor_check_multiple(&cx.outer, key, data, flags);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
}
if (flags & MDBX_RESERVE) {
if (unlikely(cx.outer.tree->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_INTEGERDUP | MDBX_DUPFIXED)))
return LOG_IFERR(MDBX_INCOMPATIBLE);
data->iov_base = nullptr;
}
cx.outer.next = txn->cursors[dbi];
txn->cursors[dbi] = &cx.outer;
/* LY: support for update (explicit overwrite) */
if (flags & MDBX_CURRENT) {
rc = cursor_seek(&cx.outer, (MDBX_val *)key, nullptr, MDBX_SET).err;
if (likely(rc == MDBX_SUCCESS) && (txn->dbs[dbi].flags & MDBX_DUPSORT) && (flags & MDBX_ALLDUPS) == 0) {
/* LY: allows update (explicit overwrite) only for unique keys */
node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
if (node_flags(node) & N_DUP) {
tASSERT(txn, inner_pointed(&cx.outer) && cx.outer.subcur->nested_tree.items > 1);
rc = MDBX_EMULTIVAL;
if ((flags & MDBX_NOOVERWRITE) == 0) {
flags -= MDBX_CURRENT;
rc = cursor_del(&cx.outer, MDBX_ALLDUPS);
}
}
}
}
if (likely(rc == MDBX_SUCCESS))
rc = cursor_put_checklen(&cx.outer, key, data, flags);
txn->cursors[dbi] = cx.outer.next;
return LOG_IFERR(rc);
}
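/* Illustrative usage sketch (not part of the original source): the basic
 * put/del flow within a single write transaction. Assumes only the public
 * <mdbx.h> API; the table name "demo", the payloads and the function name
 * are arbitrary choices. */
static int demo_upsert_and_delete(MDBX_env *env) {
  MDBX_txn *txn;
  int rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_READWRITE, &txn);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_dbi dbi;
  rc = mdbx_dbi_open(txn, "demo", MDBX_CREATE, &dbi);
  if (rc == MDBX_SUCCESS) {
    char k[] = "answer", v[] = "42";
    MDBX_val key = {.iov_base = k, .iov_len = sizeof(k) - 1};
    MDBX_val data = {.iov_base = v, .iov_len = sizeof(v) - 1};
    /* MDBX_UPSERT overwrites an existing value or inserts a new pair. */
    rc = mdbx_put(txn, dbi, &key, &data, MDBX_UPSERT);
    if (rc == MDBX_SUCCESS)
      /* Passing nullptr for data removes the key together with all its values. */
      rc = mdbx_del(txn, dbi, &key, nullptr);
  }
  if (rc == MDBX_SUCCESS)
    return mdbx_txn_commit(txn);
  mdbx_txn_abort(txn);
  return rc;
}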
//------------------------------------------------------------------------------
/* Allows updating or deleting an existing record while obtaining the previous
 * data value into old_data. If new_data is null, a deletion is performed;
 * otherwise, an update/insert.
 *
 * The current value may reside in an already modified (dirty) page. In that
 * case the page will be overwritten during the update and the old value
 * itself will be lost. Therefore old_data must initially be given an
 * additional buffer into which the old value can be copied.
 * If the passed buffer is too small, the function returns -1 and sets
 * old_data->iov_len to the required size.
 *
 * For non-unique keys a second usage scenario is also possible, where
 * old_data selects a specific record among those with the same key for
 * deletion/update. To choose this scenario, MDBX_CURRENT and
 * MDBX_NOOVERWRITE must be specified in flags simultaneously. This
 * combination was chosen exactly because it is otherwise meaningless and
 * thereby allows a request for such a scenario to be identified.
 *
 * The function could be superseded by the corresponding cursor operations
 * after two improvements (TODO):
 * - external allocation of cursors, including on the stack (without malloc);
 * - querying the dirty status of a page by address (being aware of
 *   MUTABLE/WRITEABLE).
 */
int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data,
MDBX_put_flags_t flags, MDBX_preserve_func preserver, void *preserver_context) {
if (unlikely(!key || !old_data || old_data == new_data))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(old_data->iov_base == nullptr && old_data->iov_len))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(new_data == nullptr && (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(dbi <= FREE_DBI))
return LOG_IFERR(MDBX_BAD_DBI);
if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND |
MDBX_APPENDDUP | MDBX_CURRENT)))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cursor_couple_t cx;
rc = cursor_init(&cx.outer, txn, dbi);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
cx.outer.next = txn->cursors[dbi];
txn->cursors[dbi] = &cx.outer;
MDBX_val present_key = *key;
if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) {
/* old_data contains the value used to select the specific duplicate */
if (unlikely(!(txn->dbs[dbi].flags & MDBX_DUPSORT))) {
rc = MDBX_EINVAL;
goto bailout;
}
/* drop the extra bit, it merely marked the requested mode */
flags -= MDBX_NOOVERWRITE;
rc = cursor_seek(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err;
if (rc != MDBX_SUCCESS)
goto bailout;
} else {
/* old_data is a buffer for saving the previous value */
if (unlikely(new_data && old_data->iov_base == new_data->iov_base))
return LOG_IFERR(MDBX_EINVAL);
MDBX_val present_data;
rc = cursor_seek(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err;
if (unlikely(rc != MDBX_SUCCESS)) {
old_data->iov_base = nullptr;
old_data->iov_len = 0;
if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT))
goto bailout;
} else if (flags & MDBX_NOOVERWRITE) {
rc = MDBX_KEYEXIST;
*old_data = present_data;
goto bailout;
} else {
page_t *page = cx.outer.pg[cx.outer.top];
if (txn->dbs[dbi].flags & MDBX_DUPSORT) {
if (flags & MDBX_CURRENT) {
/* disallow update/delete for multi-values */
node_t *node = page_node(page, cx.outer.ki[cx.outer.top]);
if (node_flags(node) & N_DUP) {
tASSERT(txn, inner_pointed(&cx.outer) && cx.outer.subcur->nested_tree.items > 1);
if (cx.outer.subcur->nested_tree.items > 1) {
rc = MDBX_EMULTIVAL;
goto bailout;
}
}
/* In LMDB the MDBX_CURRENT flag here would lead to replacing the data
 * without regard to the MDBX_DUPSORT ordering, but here this is
 * permissible in any case, since we have verified that the key has
 * only one value. */
}
}
if (is_modifable(txn, page)) {
if (new_data && eq_fast(&present_data, new_data)) {
/* if the data matches, there is nothing to do */
*old_data = *new_data;
goto bailout;
}
rc = preserver ? preserver(preserver_context, old_data, present_data.iov_base, present_data.iov_len)
: MDBX_SUCCESS;
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
} else {
*old_data = present_data;
}
flags |= MDBX_CURRENT;
}
}
if (likely(new_data))
rc = cursor_put_checklen(&cx.outer, key, new_data, flags);
else
rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS);
bailout:
txn->cursors[dbi] = cx.outer.next;
return LOG_IFERR(rc);
}
static int default_value_preserver(void *context, MDBX_val *target, const void *src, size_t bytes) {
(void)context;
if (unlikely(target->iov_len < bytes)) {
target->iov_base = nullptr;
target->iov_len = bytes;
return MDBX_RESULT_TRUE;
}
memcpy(target->iov_base, src, target->iov_len = bytes);
return MDBX_SUCCESS;
}
int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data,
MDBX_put_flags_t flags) {
return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags, default_value_preserver, nullptr);
}
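/* Illustrative usage sketch (not part of the original source): updating a
 * value while capturing the previous one via mdbx_replace(). Assumes only the
 * public <mdbx.h> API; the backup-buffer size and the function name are
 * arbitrary choices. */
static int demo_replace(MDBX_txn *rw_txn, MDBX_dbi dbi, MDBX_val key, MDBX_val fresh) {
  char backup[128];
  /* old_data must describe a caller-owned buffer: depending on whether the
   * page is dirty, it receives either a copy of the previous value or a
   * pointer directly into the map. If the buffer is too small, -1
   * (MDBX_RESULT_TRUE) is returned with iov_len set to the required size. */
  MDBX_val old_data = {.iov_base = backup, .iov_len = sizeof(backup)};
  return mdbx_replace(rw_txn, dbi, &key, &fresh, &old_data, MDBX_UPSERT);
}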

View File

@ -1,540 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
#ifdef __SANITIZE_THREAD__
/* LY: avoid tsan-trap by txn, mm_last_pg and geo.first_unallocated */
__attribute__((__no_sanitize_thread__, __noinline__))
#endif
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
{
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
if (likely(rc == MDBX_SUCCESS))
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR((rc > 0) ? -rc : rc);
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) {
if (percent)
*percent = (int)((txn->geo.first_unallocated * UINT64_C(100) + txn->geo.end_pgno / 2) / txn->geo.end_pgno);
return 0;
}
txnid_t lag;
troika_t troika = meta_tap(txn->env);
do {
const meta_ptr_t head = meta_recent(txn->env, &troika);
if (percent) {
const pgno_t maxpg = head.ptr_v->geometry.now;
*percent = (int)((head.ptr_v->geometry.first_unallocated * UINT64_C(100) + maxpg / 2) / maxpg);
}
lag = (head.txnid - txn->txnid) / xMDBX_TXNID_STEP;
} while (unlikely(meta_should_retry(txn->env, &troika)));
return (lag > INT_MAX) ? INT_MAX : (int)lag;
}
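/* Illustrative usage sketch (not part of the original source): restarting a
 * long-lived read transaction once it lags too far behind the writers, so
 * that old MVCC snapshots can be reclaimed. Assumes only the public <mdbx.h>
 * API; the lag threshold and the function name are arbitrary choices. */
static int demo_refresh_if_stale(MDBX_txn *ro_txn) {
  int fill_percent = 0;
  const int lag = mdbx_txn_straggler(ro_txn, &fill_percent);
  if (lag < 0)
    return lag; /* an error code (returned negated when positive) */
  if (lag > 1000) {
    /* Drop the old snapshot and acquire a fresh one. */
    int rc = mdbx_txn_reset(ro_txn);
    if (rc == MDBX_SUCCESS)
      rc = mdbx_txn_renew(ro_txn);
    return rc;
  }
  return MDBX_SUCCESS;
}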
MDBX_env *mdbx_txn_env(const MDBX_txn *txn) {
if (unlikely(!txn || txn->signature != txn_signature || txn->env->signature.weak != env_signature))
return nullptr;
return txn->env;
}
uint64_t mdbx_txn_id(const MDBX_txn *txn) {
if (unlikely(!txn || txn->signature != txn_signature))
return 0;
return txn->txnid;
}
MDBX_txn_flags_t mdbx_txn_flags(const MDBX_txn *txn) {
STATIC_ASSERT(
(MDBX_TXN_INVALID & (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD |
txn_gc_drained | txn_shrink_allowed | txn_rw_begin_flags | txn_ro_begin_flags)) == 0);
if (unlikely(!txn || txn->signature != txn_signature))
return MDBX_TXN_INVALID;
assert(0 == (int)(txn->flags & MDBX_TXN_INVALID));
MDBX_txn_flags_t flags = txn->flags;
if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->ro.slot &&
safe64_read(&txn->ro.slot->tid) == MDBX_TID_TXN_OUSTED)
flags |= MDBX_TXN_OUSTED;
return flags;
}
int mdbx_txn_reset(MDBX_txn *txn) {
int rc = check_txn(txn, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, false);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
/* This call is only valid for read-only txns */
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
return LOG_IFERR(MDBX_EINVAL);
/* LY: don't close DBI-handles */
rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE);
if (rc == MDBX_SUCCESS) {
tASSERT(txn, txn->signature == txn_signature);
tASSERT(txn, txn->owner == 0);
}
return LOG_IFERR(rc);
}
int mdbx_txn_break(MDBX_txn *txn) {
do {
int rc = check_txn(txn, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
txn->flags |= MDBX_TXN_ERROR;
txn = txn->nested;
} while (txn);
return MDBX_SUCCESS;
}
int mdbx_txn_abort(MDBX_txn *txn) {
int rc = check_txn(txn, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
#if MDBX_TXN_CHECKOWNER
if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) == MDBX_NOSTICKYTHREADS &&
unlikely(txn->owner != osal_thread_self())) {
mdbx_txn_break(txn);
return LOG_IFERR(MDBX_THREAD_MISMATCH);
}
#endif /* MDBX_TXN_CHECKOWNER */
return LOG_IFERR(txn_abort(txn));
}
int mdbx_txn_park(MDBX_txn *txn, bool autounpark) {
STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_ERROR);
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
return LOG_IFERR(MDBX_TXN_INVALID);
if (unlikely((txn->flags & MDBX_TXN_ERROR))) {
rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE);
return LOG_IFERR(rc ? rc : MDBX_OUSTED);
}
return LOG_IFERR(txn_ro_park(txn, autounpark));
}
int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) {
STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_PARKED + MDBX_TXN_ERROR);
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED - MDBX_TXN_ERROR);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!F_ISSET(txn->flags, MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
return MDBX_SUCCESS;
rc = txn_ro_unpark(txn);
if (likely(rc != MDBX_OUSTED) || !restart_if_ousted)
return LOG_IFERR(rc);
tASSERT(txn, txn->flags & MDBX_TXN_FINISHED);
rc = txn_renew(txn, MDBX_TXN_RDONLY);
return (rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : LOG_IFERR(rc);
}
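/* Illustrative sketch, not part of the original source: parking an idle read
* transaction so a writer may oust its snapshot if space is needed, then
* resuming it. MDBX_RESULT_TRUE from mdbx_txn_unpark() signals that the old
* snapshot had been ousted and the transaction was restarted on a fresh one.
* The helper name is an assumption for the example only. */
MDBX_MAYBE_UNUSED static int example_park_while_idle(MDBX_txn *ro_txn) {
  int rc = mdbx_txn_park(ro_txn, /* autounpark */ false);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* ... the application is idle for a while ... */
  rc = mdbx_txn_unpark(ro_txn, /* restart_if_ousted */ true);
  if (rc == MDBX_RESULT_TRUE)
    rc = MDBX_SUCCESS; /* the snapshot was recycled; the txn now sees a newer one */
  return rc;
}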
int mdbx_txn_renew(MDBX_txn *txn) {
int rc = check_txn(txn, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
rc = check_env(txn->env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
return LOG_IFERR(MDBX_EINVAL);
if (unlikely(txn->owner != 0 || !(txn->flags & MDBX_TXN_FINISHED))) {
rc = mdbx_txn_reset(txn);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
rc = txn_renew(txn, MDBX_TXN_RDONLY);
if (rc == MDBX_SUCCESS) {
tASSERT(txn, txn->owner == ((txn->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()));
DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
(txn->flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)txn->env, txn->dbs[MAIN_DBI].root,
txn->dbs[FREE_DBI].root);
}
return LOG_IFERR(rc);
}
int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) {
int rc = check_txn(txn, 0);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
txn->userctx = ctx;
return MDBX_SUCCESS;
}
void *mdbx_txn_get_userctx(const MDBX_txn *txn) { return check_txn(txn, MDBX_TXN_FINISHED) ? nullptr : txn->userctx; }
int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret, void *context) {
if (unlikely(!ret))
return LOG_IFERR(MDBX_EINVAL);
*ret = nullptr;
if (unlikely((flags & ~txn_rw_begin_flags) && (parent || (flags & ~txn_ro_begin_flags))))
return LOG_IFERR(MDBX_EINVAL);
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(env->flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */
return LOG_IFERR(MDBX_EACCESS);
/* Reuse preallocated write txn. However, do not touch it until
* txn_renew() succeeds, since it currently may be active. */
MDBX_txn *txn = nullptr;
if (parent) {
/* Nested transactions: Max 1 child, write txns only, no writemap */
rc = check_txn(parent, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(parent->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP))) {
rc = MDBX_BAD_TXN;
if ((parent->flags & MDBX_TXN_RDONLY) == 0) {
ERROR("%s mode is incompatible with nested transactions", "MDBX_WRITEMAP");
rc = MDBX_INCOMPATIBLE;
}
return LOG_IFERR(rc);
}
if (unlikely(parent->env != env))
return LOG_IFERR(MDBX_BAD_TXN);
flags |= parent->flags & (txn_rw_begin_flags | MDBX_TXN_SPILLS | MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP);
rc = txn_nested_create(parent, flags);
txn = parent->nested;
if (unlikely(rc != MDBX_SUCCESS)) {
int err = txn_end(txn, TXN_END_FAIL_BEGIN_NESTED);
return err ? err : rc;
}
if (AUDIT_ENABLED() && ASSERT_ENABLED()) {
txn->signature = txn_signature;
tASSERT(txn, audit_ex(txn, 0, false) == 0);
}
} else {
txn = env->basal_txn;
if (flags & MDBX_TXN_RDONLY) {
txn = txn_alloc(flags, env);
if (unlikely(!txn))
return LOG_IFERR(MDBX_ENOMEM);
}
rc = txn_renew(txn, flags);
if (unlikely(rc != MDBX_SUCCESS)) {
if (txn != env->basal_txn)
osal_free(txn);
return LOG_IFERR(rc);
}
}
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
else if (flags & MDBX_TXN_RDONLY)
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
/* Win32: SRWL flag */ txn_shrink_allowed)) == 0);
else {
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | txn_shrink_allowed | txn_may_have_cursors |
MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0);
assert(!txn->wr.spilled.list && !txn->wr.spilled.least_removed);
}
txn->signature = txn_signature;
txn->userctx = context;
*ret = txn;
DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid,
(flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root,
txn->dbs[FREE_DBI].root);
return MDBX_SUCCESS;
}
static void latency_gcprof(MDBX_commit_latency *latency, const MDBX_txn *txn) {
MDBX_env *const env = txn->env;
if (latency && likely(env->lck) && MDBX_ENABLE_PROFGC) {
pgop_stat_t *const ptr = &env->lck->pgops;
latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter;
latency->gc_prof.work_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic);
latency->gc_prof.work_xtime_cpu = osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu);
latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps;
latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages;
latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt;
latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter;
latency->gc_prof.self_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic);
latency->gc_prof.self_xtime_cpu = osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu);
latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps;
latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages;
latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt;
latency->gc_prof.wloops = ptr->gc_prof.wloops;
latency->gc_prof.coalescences = ptr->gc_prof.coalescences;
latency->gc_prof.wipes = ptr->gc_prof.wipes;
latency->gc_prof.flushes = ptr->gc_prof.flushes;
latency->gc_prof.kicks = ptr->gc_prof.kicks;
latency->gc_prof.pnl_merge_work.time = osal_monotime_to_16dot16(ptr->gc_prof.work.pnl_merge.time);
latency->gc_prof.pnl_merge_work.calls = ptr->gc_prof.work.pnl_merge.calls;
latency->gc_prof.pnl_merge_work.volume = ptr->gc_prof.work.pnl_merge.volume;
latency->gc_prof.pnl_merge_self.time = osal_monotime_to_16dot16(ptr->gc_prof.self.pnl_merge.time);
latency->gc_prof.pnl_merge_self.calls = ptr->gc_prof.self.pnl_merge.calls;
latency->gc_prof.pnl_merge_self.volume = ptr->gc_prof.self.pnl_merge.volume;
if (txn == env->basal_txn)
memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof));
}
}
static void latency_init(MDBX_commit_latency *latency, struct commit_timestamp *ts) {
ts->start = 0;
ts->gc_cpu = 0;
if (latency) {
ts->start = osal_monotime();
memset(latency, 0, sizeof(*latency));
}
ts->prep = ts->gc = ts->audit = ts->write = ts->sync = ts->start;
}
static void latency_done(MDBX_commit_latency *latency, struct commit_timestamp *ts) {
if (latency) {
latency->preparation = (ts->prep > ts->start) ? osal_monotime_to_16dot16(ts->prep - ts->start) : 0;
latency->gc_wallclock = (ts->gc > ts->prep) ? osal_monotime_to_16dot16(ts->gc - ts->prep) : 0;
latency->gc_cputime = ts->gc_cpu ? osal_monotime_to_16dot16(ts->gc_cpu) : 0;
latency->audit = (ts->audit > ts->gc) ? osal_monotime_to_16dot16(ts->audit - ts->gc) : 0;
latency->write = (ts->write > ts->audit) ? osal_monotime_to_16dot16(ts->write - ts->audit) : 0;
latency->sync = (ts->sync > ts->write) ? osal_monotime_to_16dot16(ts->sync - ts->write) : 0;
const uint64_t ts_end = osal_monotime();
latency->ending = (ts_end > ts->sync) ? osal_monotime_to_16dot16(ts_end - ts->sync) : 0;
latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_end - ts->start);
}
}
int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR - MDBX_TXN_PARKED);
struct commit_timestamp ts;
latency_init(latency, &ts);
int rc = check_txn(txn, MDBX_TXN_FINISHED);
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_BAD_TXN && F_ISSET(txn->flags, MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) {
rc = MDBX_RESULT_TRUE;
goto fail;
}
return LOG_IFERR(rc);
}
MDBX_env *const env = txn->env;
if (MDBX_ENV_CHECKPID && unlikely(env->pid != osal_getpid())) {
env->flags |= ENV_FATAL_ERROR;
rc = MDBX_PANIC;
return LOG_IFERR(rc);
}
if (txn->flags & MDBX_TXN_RDONLY) {
if (unlikely(txn->parent || (txn->flags & MDBX_TXN_HAS_CHILD) || txn == env->txn || txn == env->basal_txn)) {
ERROR("attempt to commit %s txn %p", "strange read-only", (void *)txn);
return MDBX_PROBLEM;
}
latency_gcprof(latency, txn);
rc = (txn->flags & MDBX_TXN_ERROR) ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
txn_end(txn, TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE);
goto done;
}
#if MDBX_TXN_CHECKOWNER
if ((txn->flags & MDBX_NOSTICKYTHREADS) && txn == env->basal_txn && unlikely(txn->owner != osal_thread_self())) {
txn->flags |= MDBX_TXN_ERROR;
rc = MDBX_THREAD_MISMATCH;
return LOG_IFERR(rc);
}
#endif /* MDBX_TXN_CHECKOWNER */
if (unlikely(txn->flags & MDBX_TXN_ERROR)) {
rc = MDBX_RESULT_TRUE;
fail:
latency_gcprof(latency, txn);
int err = txn_abort(txn);
if (unlikely(err != MDBX_SUCCESS))
rc = err;
goto done;
}
if (txn->nested) {
rc = mdbx_txn_commit_ex(txn->nested, nullptr);
tASSERT(txn, txn->nested == nullptr);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
if (unlikely(txn != env->txn)) {
ERROR("attempt to commit %s txn %p", "unknown", (void *)txn);
return MDBX_EINVAL;
}
if (txn->parent) {
if (unlikely(txn->parent->nested != txn || txn->parent->env != env)) {
ERROR("attempt to commit %s txn %p", "strange nested", (void *)txn);
return MDBX_PROBLEM;
}
latency_gcprof(latency, txn);
rc = txn_nested_join(txn, latency ? &ts : nullptr);
goto done;
}
rc = txn_basal_commit(txn, latency ? &ts : nullptr);
latency_gcprof(latency, txn);
int end = TXN_END_COMMITTED | TXN_END_UPDATE;
if (unlikely(rc != MDBX_SUCCESS)) {
end = TXN_END_ABORT;
if (rc == MDBX_RESULT_TRUE) {
end = TXN_END_PURE_COMMIT | TXN_END_UPDATE;
rc = MDBX_NOSUCCESS_PURE_COMMIT ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
}
}
int err = txn_end(txn, end);
if (unlikely(err != MDBX_SUCCESS))
rc = err;
done:
latency_done(latency, &ts);
return LOG_IFERR(rc);
}
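/* Illustrative sketch, not part of the original source: committing with
* latency metrics. The fields filled by latency_done() above are 16.16
* fixed-point seconds (see osal_monotime_to_16dot16), so dividing by 65536.0
* to obtain floating-point seconds is an assumption based on that encoding. */
MDBX_MAYBE_UNUSED static int example_commit_with_metrics(MDBX_txn *txn) {
  MDBX_commit_latency latency;
  const int rc = mdbx_txn_commit_ex(txn, &latency);
  if (rc == MDBX_SUCCESS)
    NOTICE("commit took %.3f s (gc %.3f s, write %.3f s, sync %.3f s)", latency.whole / 65536.0,
           latency.gc_wallclock / 65536.0, latency.write / 65536.0, latency.sync / 65536.0);
  return rc;
}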
int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
int rc = check_txn(txn, MDBX_TXN_FINISHED);
if (unlikely(rc != MDBX_SUCCESS))
return LOG_IFERR(rc);
if (unlikely(!info))
return LOG_IFERR(MDBX_EINVAL);
MDBX_env *const env = txn->env;
#if MDBX_ENV_CHECKPID
if (unlikely(env->pid != osal_getpid())) {
env->flags |= ENV_FATAL_ERROR;
return LOG_IFERR(MDBX_PANIC);
}
#endif /* MDBX_ENV_CHECKPID */
info->txn_id = txn->txnid;
info->txn_space_used = pgno2bytes(env, txn->geo.first_unallocated);
if (txn->flags & MDBX_TXN_RDONLY) {
meta_ptr_t head;
uint64_t head_retired;
troika_t troika = meta_tap(env);
do {
/* fetch info from volatile head */
head = meta_recent(env, &troika);
head_retired = unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired);
info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->geometry.now);
info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->geometry.upper);
info->txn_space_leftover = pgno2bytes(env, head.ptr_v->geometry.now - head.ptr_v->geometry.first_unallocated);
} while (unlikely(meta_should_retry(env, &troika)));
info->txn_reader_lag = head.txnid - info->txn_id;
info->txn_space_dirty = info->txn_space_retired = 0;
uint64_t reader_snapshot_pages_retired = 0;
if (txn->ro.slot &&
((txn->flags & MDBX_TXN_PARKED) == 0 || safe64_read(&txn->ro.slot->tid) != MDBX_TID_TXN_OUSTED) &&
head_retired >
(reader_snapshot_pages_retired = atomic_load64(&txn->ro.slot->snapshot_pages_retired, mo_Relaxed))) {
info->txn_space_dirty = info->txn_space_retired =
pgno2bytes(env, (pgno_t)(head_retired - reader_snapshot_pages_retired));
size_t retired_next_reader = 0;
lck_t *const lck = env->lck_mmap.lck;
if (scan_rlt && info->txn_reader_lag > 1 && lck) {
/* find next more recent reader */
txnid_t next_reader = head.txnid;
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
for (size_t i = 0; i < snap_nreaders; ++i) {
retry:
if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) {
jitter4testing(true);
const uint64_t snap_tid = safe64_read(&lck->rdt[i].tid);
const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
const uint64_t snap_retired = atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_AcquireRelease);
if (unlikely(snap_retired != atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed)) ||
snap_txnid != safe64_read(&lck->rdt[i].txnid) || snap_tid != safe64_read(&lck->rdt[i].tid))
goto retry;
if (snap_txnid <= txn->txnid) {
retired_next_reader = 0;
break;
}
if (snap_txnid < next_reader && snap_tid >= MDBX_TID_TXN_OUSTED) {
next_reader = snap_txnid;
retired_next_reader = pgno2bytes(
env, (pgno_t)(snap_retired - atomic_load64(&txn->ro.slot->snapshot_pages_retired, mo_Relaxed)));
}
}
}
}
info->txn_space_dirty = retired_next_reader;
}
} else {
info->txn_space_limit_soft = pgno2bytes(env, txn->geo.now);
info->txn_space_limit_hard = pgno2bytes(env, txn->geo.upper);
info->txn_space_retired =
pgno2bytes(env, txn->nested ? (size_t)txn->wr.retired_pages : MDBX_PNL_GETSIZE(txn->wr.retired_pages));
info->txn_space_leftover = pgno2bytes(env, txn->wr.dirtyroom);
info->txn_space_dirty =
pgno2bytes(env, txn->wr.dirtylist ? txn->wr.dirtylist->pages_including_loose
: (txn->wr.writemap_dirty_npages + txn->wr.writemap_spilled_npages));
info->txn_reader_lag = INT64_MAX;
lck_t *const lck = env->lck_mmap.lck;
if (scan_rlt && lck) {
txnid_t oldest_reading = txn->txnid;
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
if (snap_nreaders) {
txn_gc_detent(txn);
oldest_reading = txn->env->gc.detent;
if (oldest_reading == txn->wr.troika.txnid[txn->wr.troika.recent]) {
/* If the oldest snapshot in use is the previous one, i.e. the one immediately preceding the current
* transaction, then scan the reader table to find out whether that snapshot is really being used
* by readers. */
oldest_reading = txn->txnid;
for (size_t i = 0; i < snap_nreaders; ++i) {
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->env->gc.detent == safe64_read(&lck->rdt[i].txnid)) {
oldest_reading = txn->env->gc.detent;
break;
}
}
}
}
info->txn_reader_lag = txn->txnid - oldest_reading;
}
}
return MDBX_SUCCESS;
}
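/* Illustrative sketch, not part of the original source: using the fields
* filled above to check whether a write transaction still has room before
* starting a large batch. The 25% dirty-space threshold and the helper name
* are assumptions for the example only. */
MDBX_MAYBE_UNUSED static bool example_txn_has_room(const MDBX_txn *txn) {
  MDBX_txn_info info;
  if (mdbx_txn_info(txn, &info, /* scan_rlt */ false) != MDBX_SUCCESS)
    return false;
  /* stay below a quarter of the hard space limit for dirty pages */
  return info.txn_space_dirty * 4 < info.txn_space_limit_hard;
}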

View File

@ -1,364 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
#ifndef __cplusplus
#ifdef MDBX_HAVE_C11ATOMICS
#define osal_memory_fence(order, write) atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order))
#else /* MDBX_HAVE_C11ATOMICS */
#define osal_memory_fence(order, write) \
do { \
osal_compiler_barrier(); \
if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed : mo_AcquireRelease)) \
osal_memory_barrier(); \
} while (0)
#endif /* MDBX_HAVE_C11ATOMICS */
#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__)
#define atomic_store32(p, value, order) \
({ \
const uint32_t value_to_store = (value); \
atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store, mo_c11_store(order)); \
value_to_store; \
})
#define atomic_load32(p, order) atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order))
#define atomic_store64(p, value, order) \
({ \
const uint64_t value_to_store = (value); \
atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store, mo_c11_store(order)); \
value_to_store; \
})
#define atomic_load64(p, order) atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order))
#endif /* LCC && MDBX_HAVE_C11ATOMICS */
#ifndef atomic_store32
MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(mdbx_atomic_uint32_t *p, const uint32_t value,
enum mdbx_memory_order order) {
STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order));
#else /* MDBX_HAVE_C11ATOMICS */
if (order != mo_Relaxed)
osal_compiler_barrier();
p->weak = value;
osal_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
return value;
}
#endif /* atomic_store32 */
#ifndef atomic_load32
MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const volatile mdbx_atomic_uint32_t *p,
enum mdbx_memory_order order) {
STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p)));
return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order));
#else /* MDBX_HAVE_C11ATOMICS */
osal_memory_fence(order, false);
const uint32_t value = p->weak;
if (order != mo_Relaxed)
osal_compiler_barrier();
return value;
#endif /* MDBX_HAVE_C11ATOMICS */
}
#endif /* atomic_load32 */
/*------------------------------------------------------------------------------
* safe read/write volatile 64-bit fields on 32-bit architectures. */
/* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
* #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
#ifndef xMDBX_TXNID_STEP
#if MDBX_64BIT_CAS
#define xMDBX_TXNID_STEP 1u
#else
#define xMDBX_TXNID_STEP 2u
#endif
#endif /* xMDBX_TXNID_STEP */
#ifndef atomic_store64
MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(mdbx_atomic_uint64_t *p, const uint64_t value,
enum mdbx_memory_order order) {
STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#if __GNUC_PREREQ(11, 0)
STATIC_ASSERT(__alignof__(mdbx_atomic_uint64_t) >= sizeof(uint64_t));
#endif /* GNU C >= 11 */
#ifdef MDBX_HAVE_C11ATOMICS
assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order));
#else /* MDBX_HAVE_C11ATOMICS */
if (order != mo_Relaxed)
osal_compiler_barrier();
p->weak = value;
osal_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
#else /* !MDBX_64BIT_ATOMIC */
osal_compiler_barrier();
atomic_store32(&p->low, (uint32_t)value, mo_Relaxed);
jitter4testing(true);
atomic_store32(&p->high, (uint32_t)(value >> 32), order);
jitter4testing(true);
#endif /* !MDBX_64BIT_ATOMIC */
return value;
}
#endif /* atomic_store64 */
#ifndef atomic_load64
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
__always_inline
#endif /* MDBX_64BIT_ATOMIC */
uint64_t
atomic_load64(const volatile mdbx_atomic_uint64_t *p, enum mdbx_memory_order order) {
STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#ifdef MDBX_HAVE_C11ATOMICS
assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p)));
return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order));
#else /* MDBX_HAVE_C11ATOMICS */
osal_memory_fence(order, false);
const uint64_t value = p->weak;
if (order != mo_Relaxed)
osal_compiler_barrier();
return value;
#endif /* MDBX_HAVE_C11ATOMICS */
#else /* !MDBX_64BIT_ATOMIC */
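/* Without native 64-bit atomics the value is composed from the high and low
* 32-bit halves and re-read until two consecutive composite snapshots match,
* so a torn read of a concurrently updated 64-bit value is never returned. */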
osal_compiler_barrier();
uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32;
jitter4testing(true);
value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease);
jitter4testing(true);
for (;;) {
osal_compiler_barrier();
uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32;
jitter4testing(true);
again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease);
jitter4testing(true);
if (likely(value == again))
return value;
value = again;
}
#endif /* !MDBX_64BIT_ATOMIC */
}
#endif /* atomic_load64 */
MDBX_MAYBE_UNUSED static __always_inline void atomic_yield(void) {
#if defined(_WIN32) || defined(_WIN64)
YieldProcessor();
#elif defined(__ia32__) || defined(__e2k__)
__builtin_ia32_pause();
#elif defined(__ia64__)
#if defined(__HP_cc__) || defined(__HP_aCC__)
_Asm_hint(_HINT_PAUSE);
#else
__asm__ __volatile__("hint @pause");
#endif
#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || defined(__ARM_ARCH_6K__)
#ifdef __CC_ARM
__yield();
#else
__asm__ __volatile__("yield");
#endif
#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && __mips_isa_rev >= 2
__asm__ __volatile__("pause");
#elif defined(__mips) || defined(__mips__) || defined(__mips64) || defined(__mips64__) || defined(_M_MRX000) || \
defined(_MIPS_) || defined(__MWERKS__) || defined(__sgi)
__asm__ __volatile__(".word 0x00000140");
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
sched_yield();
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
pthread_yield();
#endif
}
#if MDBX_64BIT_CAS
MDBX_MAYBE_UNUSED static __always_inline bool atomic_cas64(mdbx_atomic_uint64_t *p, uint64_t c, uint64_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t));
assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
return c == (uint64_t)_InterlockedCompareExchange64((volatile __int64 *)&p->weak, v, c);
#elif defined(__APPLE__)
return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
#endif /* MDBX_64BIT_CAS */
MDBX_MAYBE_UNUSED static __always_inline bool atomic_cas32(mdbx_atomic_uint32_t *p, uint32_t c, uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
return c == (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c);
#elif defined(__APPLE__)
return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_add32(mdbx_atomic_uint32_t *p, uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v);
#elif defined(__GNUC__) || defined(__clang__)
return __sync_fetch_and_add(&p->weak, v);
#elif defined(_MSC_VER)
STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v);
#elif defined(__APPLE__)
return OSAtomicAdd32Barrier(v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
#define atomic_sub32(p, v) atomic_add32(p, 0 - (v))
MDBX_MAYBE_UNUSED static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) {
txnid += xMDBX_TXNID_STEP;
#if !MDBX_64BIT_CAS
/* avoid overflow of low-part in safe64_reset() */
txnid += (UINT32_MAX == (uint32_t)txnid);
#endif
return txnid;
}
/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */
MDBX_MAYBE_UNUSED static __always_inline void safe64_reset(mdbx_atomic_uint64_t *p, bool single_writer) {
if (single_writer) {
#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64
atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#else
atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */
} else {
#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC
/* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */
atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#elif MDBX_64BIT_CAS
/* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */
atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#else
/* it is safe to increment the low half to avoid ABA, since xMDBX_TXNID_STEP > 1
* and overflow of the low half is prevented in safe64_txnid_next() */
STATIC_ASSERT(xMDBX_TXNID_STEP > 1);
atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */
}
assert(p->weak >= SAFE64_INVALID_THRESHOLD);
jitter4testing(true);
}
MDBX_MAYBE_UNUSED static __always_inline bool safe64_reset_compare(mdbx_atomic_uint64_t *p, uint64_t compare) {
/* LY: This function is used to reset `txnid` from the hsr-handler in case
* of asynchronous cancellation of a read transaction. Therefore,
* there may be a collision between the cleanup performed here and
* asynchronous termination and restarting of the read transaction
* in another process/thread. In general we MUST NOT reset the `txnid`
* if a new transaction was started (i.e. if `txnid` was changed). */
#if MDBX_64BIT_CAS
bool rc = atomic_cas64(p, compare, UINT64_MAX);
#else
/* LY: There is no golden ratio here, since a shared mutex would be too costly:
* that way we would have to acquire/release it for every update of txnid
* (i.e. twice for each read transaction). */
bool rc = false;
if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare &&
atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) {
if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) != (uint32_t)compare))
atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32));
else
rc = true;
}
#endif /* MDBX_64BIT_CAS */
jitter4testing(true);
return rc;
}
MDBX_MAYBE_UNUSED static __always_inline void safe64_write(mdbx_atomic_uint64_t *p, const uint64_t v) {
assert(p->weak >= SAFE64_INVALID_THRESHOLD);
#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS
atomic_store64(p, v, mo_AcquireRelease);
#else /* MDBX_64BIT_ATOMIC */
osal_compiler_barrier();
/* update low-part but still value >= SAFE64_INVALID_THRESHOLD */
atomic_store32(&p->low, (uint32_t)v, mo_Relaxed);
assert(p->weak >= SAFE64_INVALID_THRESHOLD);
jitter4testing(true);
/* update high-part from SAFE64_INVALID_THRESHOLD to actual value */
atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC */
assert(p->weak == v);
jitter4testing(true);
}
MDBX_MAYBE_UNUSED static __always_inline uint64_t safe64_read(const mdbx_atomic_uint64_t *p) {
jitter4testing(true);
uint64_t v;
do
v = atomic_load64(p, mo_AcquireRelease);
while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak));
return v;
}
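/* Summary of the safe64 protocol above: a writer first makes the value
* "invalid" via safe64_reset() (the high half becomes UINT32_MAX, so the
* whole value is >= SAFE64_INVALID_THRESHOLD), then safe64_write() stores the
* new low half and only afterwards publishes the new high half. Thus a reader
* using safe64_read() observes either a previously published value or an
* "invalid" in-progress marker, but not a torn mixture of two updates. */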
#if 0 /* unused for now */
MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) {
#if MDBX_WORDBITS >= 64
return v < SAFE64_INVALID_THRESHOLD;
#else
return (v >> 32) != UINT32_MAX;
#endif /* MDBX_WORDBITS */
}
MDBX_MAYBE_UNUSED static __always_inline bool
safe64_is_valid_ptr(const mdbx_atomic_uint64_t *p) {
#if MDBX_64BIT_ATOMIC
return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD;
#else
return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX;
#endif /* MDBX_64BIT_ATOMIC */
}
#endif /* unused for now */
/* non-atomic write with safety for reading a half-updated value */
MDBX_MAYBE_UNUSED static __always_inline void safe64_update(mdbx_atomic_uint64_t *p, const uint64_t v) {
#if MDBX_64BIT_ATOMIC
atomic_store64(p, v, mo_Relaxed);
#else
safe64_reset(p, true);
safe64_write(p, v);
#endif /* MDBX_64BIT_ATOMIC */
}
/* non-atomic increment with safety for reading a half-updated value */
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
__always_inline
#endif /* MDBX_64BIT_ATOMIC */
void
safe64_inc(mdbx_atomic_uint64_t *p, const uint64_t v) {
assert(v > 0);
safe64_update(p, safe64_read(p) + v);
}
#endif /* !__cplusplus */

View File

@ -1,97 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
#ifndef MDBX_64BIT_ATOMIC
#error "The MDBX_64BIT_ATOMIC must be defined before"
#endif /* MDBX_64BIT_ATOMIC */
#ifndef MDBX_64BIT_CAS
#error "The MDBX_64BIT_CAS must be defined before"
#endif /* MDBX_64BIT_CAS */
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
#include <cstdatomic>
#define MDBX_HAVE_C11ATOMICS
#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
!defined(__STDC_NO_ATOMICS__) && \
(__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || !(defined(__GNUC__) || defined(__clang__)))
#include <stdatomic.h>
#define MDBX_HAVE_C11ATOMICS
#elif defined(__GNUC__) || defined(__clang__)
#elif defined(_MSC_VER)
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
#pragma warning(disable : 4133) /* 'function': incompatible types - from \
'size_t' to 'LONGLONG' */
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \
'std::size_t', possible loss of data */
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \
'long', possible loss of data */
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
#elif defined(__APPLE__)
#include <libkern/OSAtomic.h>
#else
#error FIXME atomic-ops
#endif
typedef enum mdbx_memory_order {
mo_Relaxed,
mo_AcquireRelease
/* , mo_SequentialConsistency */
} mdbx_memory_order_t;
typedef union {
volatile uint32_t weak;
#ifdef MDBX_HAVE_C11ATOMICS
volatile _Atomic uint32_t c11a;
#endif /* MDBX_HAVE_C11ATOMICS */
} mdbx_atomic_uint32_t;
typedef union {
volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
__anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
mdbx_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
mdbx_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
};
#endif
} mdbx_atomic_uint64_t;
#ifdef MDBX_HAVE_C11ATOMICS
/* Crutches for C11 atomic compiler's bugs */
#if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127
#define MDBX_c11a_ro(type, ptr) (&(ptr)->weak)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->weak)
#elif defined(__clang__) && __clang__ < 8
#define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#else
#define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#endif /* Crutches for C11 atomic compiler's bugs */
#define mo_c11_store(fence) \
(((fence) == mo_Relaxed) ? memory_order_relaxed \
: ((fence) == mo_AcquireRelease) ? memory_order_release \
: memory_order_seq_cst)
#define mo_c11_load(fence) \
(((fence) == mo_Relaxed) ? memory_order_relaxed \
: ((fence) == mo_AcquireRelease) ? memory_order_acquire \
: memory_order_seq_cst)
#endif /* MDBX_HAVE_C11ATOMICS */
#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)

View File

@ -1,107 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
struct audit_ctx {
size_t used;
uint8_t *const done_bitmap;
};
static int audit_dbi(void *ctx, const MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags,
const struct MDBX_stat *stat, MDBX_dbi dbi) {
struct audit_ctx *audit_ctx = ctx;
(void)name;
(void)txn;
(void)flags;
audit_ctx->used += (size_t)stat->ms_branch_pages + (size_t)stat->ms_leaf_pages + (size_t)stat->ms_overflow_pages;
if (dbi)
audit_ctx->done_bitmap[dbi / CHAR_BIT] |= 1 << dbi % CHAR_BIT;
return MDBX_SUCCESS;
}
static size_t audit_db_used(const tree_t *db) {
return db ? (size_t)db->branch_pages + (size_t)db->leaf_pages + (size_t)db->large_pages : 0;
}
__cold static int audit_ex_locked(MDBX_txn *txn, const size_t retired_stored, const bool dont_filter_gc) {
const MDBX_env *const env = txn->env;
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
const size_t pending = txn->wr.loose_count + MDBX_PNL_GETSIZE(txn->wr.repnl) +
(MDBX_PNL_GETSIZE(txn->wr.retired_pages) - retired_stored);
cursor_couple_t cx;
int rc = cursor_init(&cx.outer, txn, FREE_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
size_t gc = 0;
MDBX_val key, data;
rc = outer_first(&cx.outer, &key, &data);
while (rc == MDBX_SUCCESS) {
if (unlikely(key.iov_len != sizeof(txnid_t))) {
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len);
return MDBX_CORRUPTED;
}
const txnid_t id = unaligned_peek_u64(4, key.iov_base);
const size_t len = *(pgno_t *)data.iov_base;
const bool acc = dont_filter_gc || !gc_is_reclaimed(txn, id);
TRACE("%s id %" PRIaTXN " len %zu", acc ? "acc" : "skip", id, len);
if (acc)
gc += len;
rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT);
}
tASSERT(txn, rc == MDBX_NOTFOUND);
const size_t done_bitmap_size = (txn->n_dbi + CHAR_BIT - 1) / CHAR_BIT;
if (txn->parent) {
tASSERT(txn, txn->n_dbi == txn->parent->n_dbi && txn->n_dbi == txn->env->txn->n_dbi);
#if MDBX_ENABLE_DBI_SPARSE
tASSERT(txn, txn->dbi_sparse == txn->parent->dbi_sparse && txn->dbi_sparse == txn->env->txn->dbi_sparse);
#endif /* MDBX_ENABLE_DBI_SPARSE */
}
struct audit_ctx ctx = {0, alloca(done_bitmap_size)};
memset(ctx.done_bitmap, 0, done_bitmap_size);
ctx.used =
NUM_METAS + audit_db_used(dbi_dig(txn, FREE_DBI, nullptr)) + audit_db_used(dbi_dig(txn, MAIN_DBI, nullptr));
rc = mdbx_enumerate_tables(txn, audit_dbi, &ctx);
tASSERT(txn, rc == MDBX_SUCCESS);
for (size_t dbi = CORE_DBS; dbi < txn->n_dbi; ++dbi) {
if (ctx.done_bitmap[dbi / CHAR_BIT] & (1 << dbi % CHAR_BIT))
continue;
const tree_t *db = dbi_dig(txn, dbi, nullptr);
if (db)
ctx.used += audit_db_used(db);
else if (dbi_state(txn, dbi))
WARNING("audit %s@%" PRIaTXN ": unable account dbi %zd / \"%.*s\", state 0x%02x", txn->parent ? "nested-" : "",
txn->txnid, dbi, (int)env->kvs[dbi].name.iov_len, (const char *)env->kvs[dbi].name.iov_base,
dbi_state(txn, dbi));
}
if (pending + gc + ctx.used == txn->geo.first_unallocated)
return MDBX_SUCCESS;
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + "
"%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)",
txn->txnid, pending, txn->wr.loose_count, MDBX_PNL_GETSIZE(txn->wr.repnl),
txn->wr.retired_pages ? MDBX_PNL_GETSIZE(txn->wr.retired_pages) : 0, retired_stored);
ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu"
"(gc) + %zu(count) = %zu(total) <> %zu"
"(allocated)",
txn->txnid, pending, gc, ctx.used, pending + gc + ctx.used, (size_t)txn->geo.first_unallocated);
return MDBX_PROBLEM;
}
__cold int audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) {
MDBX_env *const env = txn->env;
int rc = osal_fastmutex_acquire(&env->dbi_lock);
if (likely(rc == MDBX_SUCCESS)) {
rc = audit_ex_locked(txn, retired_stored, dont_filter_gc);
ENSURE(txn->env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS);
}
return rc;
}

View File

@ -1,34 +0,0 @@
N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE | MRESIZE |
--|---------|-----------|--------------|----------|-----------|------------|---------|----------|---------|
0 |0000 0001|ALLOC_RSRV |TXN_FINISHED | | |DBI_DIRTY |N_BIG |P_BRANCH | |
1 |0000 0002|ALLOC_UNIMP|TXN_ERROR |REVERSEKEY|N_TREE |DBI_STALE |N_TREE |P_LEAF | |
2 |0000 0004|ALLOC_COLSC|TXN_DIRTY |DUPSORT | |DBI_FRESH |N_DUP |P_LARGE | |
3 |0000 0008|ALLOC_SSCAN|TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | |
4 |0000 0010|ALLOC_FIFO |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | |
5 |0000 0020| |TXN_PARKED |INTEGERDUP|NODUPDATA | | |P_DUPFIX | |
6 |0000 0040| |TXN_AUTOUNPARK|REVERSEDUP|CURRENT |DBI_OLDEN | |P_SUBP | |
7 |0000 0080| |TXN_DRAINED_GC|DB_VALID |ALLDUPS |DBI_LINDO | | | |
8 |0000 0100| _MAY_MOVE |TXN_CURSORS | | | | | | <= |
9 |0000 0200| _MAY_UNMAP| | | | | | | <= |
10|0000 0400| | | | | | | | |
11|0000 0800| | | | | | | | |
12|0000 1000| | | | | | | | |
13|0000 2000|VALIDATION | | | | | |P_SPILLED | |
14|0000 4000|NOSUBDIR | | | | | |P_LOOSE | |
15|0000 8000| | | | | | |P_FROZEN | |
16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE | | |
17|0002 0000|RDONLY |TXN_RDONLY | |APPEND | |APPEND | | <= |
18|0004 0000|NOMETASYNC |TXN_NOMETASYNC|CREATE |APPENDDUP | | | | |
19|0008 0000|WRITEMAP |<= | |MULTIPLE | | | | <= |
20|0010 0000|UTTERLY | | | | | | | <= |
21|0020 0000|NOSTICKYTHR|<= | | | | | | |
22|0040 0000|EXCLUSIVE | | | | | | | |
23|0080 0000|NORDAHEAD | | | | | | | |
24|0100 0000|NOMEMINIT |TXN_PREPARE | | | | | | |
25|0200 0000|COALESCE | | | | | | | |
26|0400 0000|LIFORECLAIM| | | | | | | |
27|0800 0000|PAGEPERTURB| | | | | | | |
28|1000 0000|ENV_TXKEY |TXN_TRY | | | | | | |
29|2000 0000|ENV_ACTIVE | | | | | | | |
30|4000 0000|ACCEDE |SHRINK_ALLOWED|DB_ACCEDE | | | | | |
31|8000 0000|FATAL_ERROR| | | | | | | |

1825
src/chk.c

File diff suppressed because it is too large

View File

@ -1,309 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
/*------------------------------------------------------------------------------
* Pack/Unpack 16-bit values for Grow step & Shrink threshold */
MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t me2v(size_t m, size_t e) {
assert(m < 2048 && e < 8);
return (pgno_t)(32768 + ((m + 1) << (e + 8)));
}
MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t v2me(size_t v, size_t e) {
assert(v > (e ? me2v(2047, e - 1) : 32768));
assert(v <= me2v(2047, e));
size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8);
m -= m > 0;
assert(m < 2048 && e < 8);
// f e d c b a 9 8 7 6 5 4 3 2 1 0
// 1 e e e m m m m m m m m m m m 1
const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1));
assert(pv != 65535);
return pv;
}
/* Convert 16-bit packed (exponential quantized) value to number of pages */
pgno_t pv2pages(uint16_t pv) {
if ((pv & 0x8001) != 0x8001)
return pv;
if (pv == 65535)
return 65536;
// f e d c b a 9 8 7 6 5 4 3 2 1 0
// 1 e e e m m m m m m m m m m m 1
return me2v((pv >> 1) & 2047, (pv >> 12) & 7);
}
/* Convert number of pages to 16-bit packed (exponential quantized) value */
uint16_t pages2pv(size_t pages) {
if (pages < 32769 || (pages < 65536 && (pages & 1) == 0))
return (uint16_t)pages;
if (pages <= me2v(2047, 0))
return v2me(pages, 0);
if (pages <= me2v(2047, 1))
return v2me(pages, 1);
if (pages <= me2v(2047, 2))
return v2me(pages, 2);
if (pages <= me2v(2047, 3))
return v2me(pages, 3);
if (pages <= me2v(2047, 4))
return v2me(pages, 4);
if (pages <= me2v(2047, 5))
return v2me(pages, 5);
if (pages <= me2v(2047, 6))
return v2me(pages, 6);
return (pages < me2v(2046, 7)) ? v2me(pages, 7) : 65533;
}
__cold bool pv2pages_verify(void) {
bool ok = true, dump_translation = false;
for (size_t i = 0; i < 65536; ++i) {
size_t pages = pv2pages(i);
size_t x = pages2pv(pages);
size_t xp = pv2pages(x);
if (pages != xp) {
ERROR("%zu => %zu => %zu => %zu\n", i, pages, x, xp);
ok = false;
} else if (dump_translation && !(x == i || (x % 2 == 0 && x < 65536))) {
DEBUG("%zu => %zu => %zu => %zu\n", i, pages, x, xp);
}
}
return ok;
}
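/* Worked example, not part of the original source, assuming the definitions
* above: a request of 150000 pages is quantized by pages2pv() as
*   v2me(150000, 0): m = ((150000 - 32768 + 255) >> 8) - 1 = 457
*   pv = 0x8001 + (0 << 12) + (457 << 1) = 0x8393
* and decodes back via pv2pages(0x8393) = 32768 + (458 << 8) = 150016,
* i.e. the request is rounded up to the next representable step (16 pages
* here). pv2pages_verify() checks that every decoded value is a fixed point
* of this pages2pv()/pv2pages() round-trip. */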
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_PURE_FUNCTION size_t bytes_align2os_bytes(const MDBX_env *env, size_t bytes) {
return ceil_powerof2(bytes, (env->ps > globals.sys_pagesize) ? env->ps : globals.sys_pagesize);
}
MDBX_NOTHROW_PURE_FUNCTION size_t pgno_align2os_bytes(const MDBX_env *env, size_t pgno) {
return ceil_powerof2(pgno2bytes(env, pgno), globals.sys_pagesize);
}
MDBX_NOTHROW_PURE_FUNCTION pgno_t pgno_align2os_pgno(const MDBX_env *env, size_t pgno) {
return bytes2pgno(env, pgno_align2os_bytes(env, pgno));
}
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_PURE_FUNCTION static __always_inline int cmp_int_inline(const size_t expected_alignment, const MDBX_val *a,
const MDBX_val *b) {
if (likely(a->iov_len == b->iov_len)) {
if (sizeof(size_t) > 7 && likely(a->iov_len == 8))
return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base),
unaligned_peek_u64(expected_alignment, b->iov_base));
if (likely(a->iov_len == 4))
return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base),
unaligned_peek_u32(expected_alignment, b->iov_base));
if (sizeof(size_t) < 8 && likely(a->iov_len == 8))
return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base),
unaligned_peek_u64(expected_alignment, b->iov_base));
}
ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP", a->iov_base, a->iov_len, b->iov_base,
b->iov_len);
return 0;
}
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) {
return cmp_int_inline(1, a, b);
}
#ifndef cmp_int_align2
/* Compare two items pointing at 2-byte aligned unsigned int's. */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) {
return cmp_int_inline(2, a, b);
}
#endif /* cmp_int_align2 */
#ifndef cmp_int_align4
/* Compare two items pointing at 4-byte aligned unsigned int's. */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) {
return cmp_int_inline(4, a, b);
}
#endif /* cmp_int_align4 */
/* Compare two items lexically */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lexical(const MDBX_val *a, const MDBX_val *b) {
if (a->iov_len == b->iov_len)
return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0;
const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1;
const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0;
return likely(diff_data) ? diff_data : diff_len;
}
MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned tail3le(const uint8_t *p, size_t l) {
STATIC_ASSERT(sizeof(unsigned) > 2);
// for length l the bytes read are p[0], p[l >> 1] and p[l - 1]:
//   l=1: 0 0 0;  l=2: 0 1 1;  l=3: 0 1 2
return p[0] | p[l >> 1] << 8 | p[l - 1] << 16;
}
/* Compare two items in reverse byte order */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_reverse(const MDBX_val *a, const MDBX_val *b) {
size_t left = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
if (likely(left)) {
const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len);
const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len);
while (left >= sizeof(size_t)) {
pa -= sizeof(size_t);
pb -= sizeof(size_t);
left -= sizeof(size_t);
STATIC_ASSERT(sizeof(size_t) == 4 || sizeof(size_t) == 8);
if (sizeof(size_t) == 4) {
uint32_t xa = unaligned_peek_u32(1, pa);
uint32_t xb = unaligned_peek_u32(1, pb);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
xa = osal_bswap32(xa);
xb = osal_bswap32(xb);
#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */
if (xa != xb)
return (xa < xb) ? -1 : 1;
} else {
uint64_t xa = unaligned_peek_u64(1, pa);
uint64_t xb = unaligned_peek_u64(1, pb);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
xa = osal_bswap64(xa);
xb = osal_bswap64(xb);
#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */
if (xa != xb)
return (xa < xb) ? -1 : 1;
}
}
if (sizeof(size_t) == 8 && left >= 4) {
pa -= 4;
pb -= 4;
left -= 4;
uint32_t xa = unaligned_peek_u32(1, pa);
uint32_t xb = unaligned_peek_u32(1, pb);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
xa = osal_bswap32(xa);
xb = osal_bswap32(xb);
#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */
if (xa != xb)
return (xa < xb) ? -1 : 1;
}
if (left) {
unsigned xa = tail3le(pa - left, left);
unsigned xb = tail3le(pb - left, left);
if (xa != xb)
return (xa < xb) ? -1 : 1;
}
}
return CMP2INT(a->iov_len, b->iov_len);
}
/* Fast non-lexically comparator */
MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) {
int diff = CMP2INT(a->iov_len, b->iov_len);
return (likely(diff) || a->iov_len == 0) ? diff : memcmp(a->iov_base, b->iov_base, a->iov_len);
}
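/* Descriptive note for eq_fast_slowpath() below: it compares overlapping
* windows instead of looping byte-wise. For lengths 4..8 the first and the
* last 4 bytes are compared; for lengths 9..16 (when 8-byte unaligned loads
* are allowed) the first and the last 8 bytes. The two windows jointly cover
* every byte, so OR-ing the two differences decides equality; longer inputs
* fall back to memcmp() and tiny ones to tail3le(). */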
MDBX_NOTHROW_PURE_FUNCTION __hot bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l) {
if (likely(l > 3)) {
if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9))
return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) |
(unaligned_peek_u32(1, a + l - 4) - unaligned_peek_u32(1, b + l - 4))) == 0;
if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17))
return ((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) |
(unaligned_peek_u64(1, a + l - 8) - unaligned_peek_u64(1, b + l - 8))) == 0;
return memcmp(a, b, l) == 0;
}
if (likely(l))
return tail3le(a, l) == tail3le(b, l);
return true;
}
int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b) { return eq_fast(a, b) ? 0 : 1; }
int cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b) { return eq_fast(a, b) ? 0 : -1; }
/*----------------------------------------------------------------------------*/
__cold void update_mlcnt(const MDBX_env *env, const pgno_t new_aligned_mlocked_pgno, const bool lock_not_release) {
for (;;) {
const pgno_t mlock_pgno_before = atomic_load32(&env->mlocked_pgno, mo_AcquireRelease);
eASSERT(env, pgno_align2os_pgno(env, mlock_pgno_before) == mlock_pgno_before);
eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) == new_aligned_mlocked_pgno);
if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno)
: (mlock_pgno_before <= new_aligned_mlocked_pgno))
break;
if (likely(atomic_cas32(&((MDBX_env *)env)->mlocked_pgno, mlock_pgno_before, new_aligned_mlocked_pgno)))
for (;;) {
mdbx_atomic_uint32_t *const mlcnt = env->lck->mlcnt;
const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed);
const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed);
if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) {
eASSERT(env, lock_not_release);
if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1)))
continue;
}
if (new_aligned_mlocked_pgno == 0 && (snap_locked - snap_unlocked) > 0) {
eASSERT(env, !lock_not_release);
if (unlikely(!atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1)))
continue;
}
NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u", lock_not_release ? "lock" : "unlock",
lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno,
lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_before, snap_locked - snap_unlocked,
atomic_load32(mlcnt + 0, mo_Relaxed) - atomic_load32(mlcnt + 1, mo_Relaxed));
return;
}
}
}
__cold void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, const size_t end_bytes) {
if (atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) > aligned_pgno) {
int err = MDBX_ENOSYS;
const size_t munlock_begin = pgno2bytes(env, aligned_pgno);
const size_t munlock_size = end_bytes - munlock_begin;
eASSERT(env, end_bytes % globals.sys_pagesize == 0 && munlock_begin % globals.sys_pagesize == 0 &&
munlock_size % globals.sys_pagesize == 0);
#if defined(_WIN32) || defined(_WIN64)
err = VirtualUnlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size) ? MDBX_SUCCESS : (int)GetLastError();
if (err == ERROR_NOT_LOCKED)
err = MDBX_SUCCESS;
#elif defined(_POSIX_MEMLOCK_RANGE)
err = munlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size) ? errno : MDBX_SUCCESS;
#endif
if (likely(err == MDBX_SUCCESS))
update_mlcnt(env, aligned_pgno, false);
else {
#if defined(_WIN32) || defined(_WIN64)
WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size, err);
#else
WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err);
#endif
}
}
}
__cold void munlock_all(const MDBX_env *env) {
munlock_after(env, 0, bytes_align2os_bytes(env, env->dxb_mmap.current));
}
/*----------------------------------------------------------------------------*/
uint32_t combine_durability_flags(const uint32_t a, const uint32_t b) {
uint32_t r = a | b;
/* avoid false MDBX_UTTERLY_NOSYNC */
if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && !F_ISSET(b, MDBX_UTTERLY_NOSYNC))
r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC;
/* convert DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */
if ((r & (MDBX_WRITEMAP | DEPRECATED_MAPASYNC)) == (MDBX_WRITEMAP | DEPRECATED_MAPASYNC) &&
!F_ISSET(r, MDBX_UTTERLY_NOSYNC))
r = (r - DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC;
/* force MDBX_NOMETASYNC if NOSYNC enabled */
if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC))
r |= MDBX_NOMETASYNC;
assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && !F_ISSET(b, MDBX_UTTERLY_NOSYNC)));
return r;
}

View File

@ -1,507 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL pgno_t pv2pages(uint16_t pv);
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint16_t pages2pv(size_t pages);
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool pv2pages_verify(void);
/*------------------------------------------------------------------------------
* Nodes, Keys & Values length limitation factors:
*
* BRANCH_NODE_MAX
*   A branch-page must contain at least two nodes, each holding a key and a child
*   page number. But a page can't be split if it contains fewer than 4 keys,
*   i.e. a page should not overflow before the fourth key is added. Therefore,
*   at least 3 branch-nodes must fit into a single branch-page. Further, the
*   first node of a branch-page doesn't contain a key, i.e. the first node
*   always requires space just for itself. Thus:
* PAGESPACE = pagesize - page_hdr_len;
* BRANCH_NODE_MAX = even_floor(
* (PAGESPACE - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t));
* KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len;
*
* LEAF_NODE_MAX
*   A leaf-node must fit into a single leaf-page, while its value may be placed on
*   a large/overflow page. However, it may be necessary to insert a nearly page-sized
*   node between two large nodes that already fill up a page. In this case the
*   page must be split in two if some pair of nodes fits on one page, or
*   otherwise into THREE pages with a single node on each of them.
*   Such 1-into-3 page splitting is costly and complex, since it requires
*   TWO insertions into the parent page, which could cause it to split,
*   and so on up to the root. Therefore double-splitting is avoided here and
*   the maximum node size is half of the leaf-page space:
* LEAF_NODE_MAX = even_floor(PAGESPACE / 2 - sizeof(indx_t));
* DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - NODESIZE - KEYLEN_MAX;
*
* - Table-node must fit into one leaf-page:
* TABLE_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(tree_t);
*
* - Dupsort values are themselves keys in a dupsort-table and cannot be longer
*   than KEYLEN_MAX. But a dupsort node must not be greater than LEAF_NODE_MAX,
*   since a dupsort value cannot be placed on a large/overflow page:
* DUPSORT_DATALEN_MAX = min(KEYLEN_MAX,
* max(DATALEN_NO_OVERFLOW, sizeof(tree_t));
*/
#define PAGESPACE(pagesize) ((pagesize) - PAGEHDRSZ)
#define BRANCH_NODE_MAX(pagesize) \
(EVEN_FLOOR((PAGESPACE(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t)))
#define LEAF_NODE_MAX(pagesize) (EVEN_FLOOR(PAGESPACE(pagesize) / 2) - sizeof(indx_t))
#define MAX_GC1OVPAGE(pagesize) (PAGESPACE(pagesize) / sizeof(pgno_t) - 1)
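/* Worked example, not part of the original source: for a 4096-byte page and
* the illustrative values PAGEHDRSZ = 20, NODESIZE = 8, sizeof(indx_t) = 2
* (the real constants come from essentials.h), the formulas above give
*   PAGESPACE(4096)       = 4096 - 20 = 4076
*   BRANCH_NODE_MAX(4096) = even_floor((4076 - 2 - 8) / 2 - 2) = 2030
*   KEYLEN_MAX            = 2030 - 8 = 2022
*   LEAF_NODE_MAX(4096)   = even_floor(4076 / 2) - 2 = 2036
* so a key of up to about 2022 bytes fits in a branch node, while a leaf node
* (and thus any dupsort value) is limited to roughly half of the page space. */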
MDBX_NOTHROW_CONST_FUNCTION static inline size_t keysize_max(size_t pagesize, MDBX_db_flags_t flags) {
assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE && is_powerof2(pagesize));
STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE >= 8);
if (flags & MDBX_INTEGERKEY)
return 8 /* sizeof(uint64_t) */;
const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE;
STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE -
/* sizeof(uint64) as a key */ 8 >
sizeof(tree_t));
if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
const intptr_t max_dupsort_leaf_key = LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(tree_t);
return (max_branch_key < max_dupsort_leaf_key) ? max_branch_key : max_dupsort_leaf_key;
}
return max_branch_key;
}
MDBX_NOTHROW_CONST_FUNCTION static inline size_t env_keysize_max(const MDBX_env *env, MDBX_db_flags_t flags) {
size_t size_max;
if (flags & MDBX_INTEGERKEY)
size_max = 8 /* sizeof(uint64_t) */;
else {
const intptr_t max_branch_key = env->branch_nodemax - NODESIZE;
STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE -
/* sizeof(uint64) as a key */ 8 >
sizeof(tree_t));
if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
const intptr_t max_dupsort_leaf_key = env->leaf_nodemax - NODESIZE - sizeof(tree_t);
size_max = (max_branch_key < max_dupsort_leaf_key) ? max_branch_key : max_dupsort_leaf_key;
} else
size_max = max_branch_key;
}
eASSERT(env, size_max == keysize_max(env->ps, flags));
return size_max;
}
MDBX_NOTHROW_CONST_FUNCTION static inline size_t keysize_min(MDBX_db_flags_t flags) {
return (flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0;
}
MDBX_NOTHROW_CONST_FUNCTION static inline size_t valsize_min(MDBX_db_flags_t flags) {
if (flags & MDBX_INTEGERDUP)
return 4 /* sizeof(uint32_t) */;
else if (flags & MDBX_DUPFIXED)
return sizeof(indx_t);
else
return 0;
}
MDBX_NOTHROW_CONST_FUNCTION static inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) {
assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE && is_powerof2(pagesize));
if (flags & MDBX_INTEGERDUP)
return 8 /* sizeof(uint64_t) */;
if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
return keysize_max(pagesize, 0);
const unsigned page_ln2 = log2n_powerof2(pagesize);
const size_t hard = 0x7FF00000ul;
const size_t hard_pages = hard >> page_ln2;
STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO);
const size_t pages_limit = PAGELIST_LIMIT / 4;
const size_t limit = (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2);
return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
}
MDBX_NOTHROW_CONST_FUNCTION static inline size_t env_valsize_max(const MDBX_env *env, MDBX_db_flags_t flags) {
size_t size_max;
if (flags & MDBX_INTEGERDUP)
size_max = 8 /* sizeof(uint64_t) */;
else if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
size_max = env_keysize_max(env, 0);
else {
const size_t hard = 0x7FF00000ul;
const size_t hard_pages = hard >> env->ps2ln;
STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO);
const size_t pages_limit = PAGELIST_LIMIT / 4;
const size_t limit = (hard_pages < pages_limit) ? hard : (pages_limit << env->ps2ln);
size_max = (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
}
eASSERT(env, size_max == valsize_max(env->ps, flags));
return size_max;
}
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_PURE_FUNCTION static inline size_t leaf_size(const MDBX_env *env, const MDBX_val *key,
const MDBX_val *data) {
size_t node_bytes = node_size(key, data);
if (node_bytes > env->leaf_nodemax)
/* put on large/overflow page */
node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t);
return node_bytes + sizeof(indx_t);
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t branch_size(const MDBX_env *env, const MDBX_val *key) {
/* Size of a node in a branch page with a given key.
* This is just the node header plus the key, there is no data. */
size_t node_bytes = node_size(key, nullptr);
if (unlikely(node_bytes > env->branch_nodemax)) {
/* put on large/overflow page, not implemented */
mdbx_panic("node_size(key) %zu > %u branch_nodemax", node_bytes, env->branch_nodemax);
node_bytes = node_size(key, nullptr) + sizeof(pgno_t);
}
return node_bytes + sizeof(indx_t);
}
MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t flags_db2sub(uint16_t db_flags) {
uint16_t sub_flags = db_flags & MDBX_DUPFIXED;
/* MDBX_INTEGERDUP => MDBX_INTEGERKEY */
#define SHIFT_INTEGERDUP_TO_INTEGERKEY 2
STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) == MDBX_INTEGERKEY);
sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY;
/* MDBX_REVERSEDUP => MDBX_REVERSEKEY */
#define SHIFT_REVERSEDUP_TO_REVERSEKEY 5
STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) == MDBX_REVERSEKEY);
sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY;
return sub_flags;
}
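A worked example of the two shifts, assuming the public flag values from mdbx.h (MDBX_INTEGERKEY = 0x08, MDBX_INTEGERDUP = 0x20, MDBX_REVERSEKEY = 0x02, MDBX_REVERSEDUP = 0x40):

#include <assert.h>
/* 0x20 >> 2 == 0x08 and 0x40 >> 5 == 0x02, so the flags of a dupsort table
 * translate directly into the key-flags of its nested tree. */
static void flags_db2sub_example(void) {
  assert(flags_db2sub(MDBX_DUPFIXED | MDBX_INTEGERDUP) == (MDBX_DUPFIXED | MDBX_INTEGERKEY));
  assert(flags_db2sub(MDBX_DUPFIXED | MDBX_REVERSEDUP) == (MDBX_DUPFIXED | MDBX_REVERSEKEY));
}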
static inline bool check_table_flags(unsigned flags) {
switch (flags & ~(MDBX_REVERSEKEY | MDBX_INTEGERKEY)) {
default:
NOTICE("invalid db-flags 0x%x", flags);
return false;
case MDBX_DUPSORT:
case MDBX_DUPSORT | MDBX_REVERSEDUP:
case MDBX_DUPSORT | MDBX_DUPFIXED:
case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP:
case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP:
case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
case MDBX_DB_DEFAULTS:
return (flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) != (MDBX_REVERSEKEY | MDBX_INTEGERKEY);
}
}
static inline int tbl_setup_ifneed(const MDBX_env *env, volatile kvx_t *const kvx, const tree_t *const db) {
return likely(kvx->clc.v.lmax) ? MDBX_SUCCESS : tbl_setup(env, kvx, db);
}
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_PURE_FUNCTION static inline size_t pgno2bytes(const MDBX_env *env, size_t pgno) {
eASSERT(env, (1u << env->ps2ln) == env->ps);
return ((size_t)pgno) << env->ps2ln;
}
MDBX_NOTHROW_PURE_FUNCTION static inline page_t *pgno2page(const MDBX_env *env, size_t pgno) {
return ptr_disp(env->dxb_mmap.base, pgno2bytes(env, pgno));
}
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) {
eASSERT(env, (env->ps >> env->ps2ln) == 1);
return (pgno_t)(bytes >> env->ps2ln);
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t bytes_align2os_bytes(const MDBX_env *env, size_t bytes);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t pgno_align2os_bytes(const MDBX_env *env, size_t pgno);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL pgno_t pgno_align2os_pgno(const MDBX_env *env, size_t pgno);
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t largechunk_npages(const MDBX_env *env, size_t bytes) {
return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1;
}
MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val get_key(const node_t *node) {
MDBX_val key;
key.iov_len = node_ks(node);
key.iov_base = node_key(node);
return key;
}
static inline void get_key_optional(const node_t *node, MDBX_val *keyptr /* __may_null */) {
if (keyptr)
*keyptr = get_key(node);
}
MDBX_NOTHROW_PURE_FUNCTION static inline void *page_data(const page_t *mp) { return ptr_disp(mp, PAGEHDRSZ); }
MDBX_NOTHROW_PURE_FUNCTION static inline const page_t *data_page(const void *data) {
return container_of(data, page_t, entries);
}
MDBX_NOTHROW_PURE_FUNCTION static inline meta_t *page_meta(page_t *mp) { return (meta_t *)page_data(mp); }
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) {
assert(mp->lower <= mp->upper);
return mp->lower >> 1;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) {
assert(mp->lower <= mp->upper);
return mp->upper - mp->lower;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_space(const MDBX_env *env) {
STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
return env->ps - PAGEHDRSZ;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_used(const MDBX_env *env, const page_t *mp) {
return page_space(env) - page_room(mp);
}
/* The page fill factor, in tenths of a percent. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline unsigned page_fill_percentum_x10(const MDBX_env *env,
const page_t *mp) {
const size_t space = page_space(env);
return (unsigned)((page_used(env, mp) * 1000 + space / 2) / space);
}
MDBX_NOTHROW_PURE_FUNCTION static inline node_t *page_node(const page_t *mp, size_t i) {
assert(page_type_compat(mp) == P_LEAF || page_type(mp) == P_BRANCH);
assert(page_numkeys(mp) > i);
assert(mp->entries[i] % 2 == 0);
return ptr_disp(mp, mp->entries[i] + PAGEHDRSZ);
}
MDBX_NOTHROW_PURE_FUNCTION static inline void *page_dupfix_ptr(const page_t *mp, size_t i, size_t keysize) {
assert(page_type_compat(mp) == (P_LEAF | P_DUPFIX) && i == (indx_t)i && mp->dupfix_ksize == keysize);
(void)keysize;
return ptr_disp(mp, PAGEHDRSZ + mp->dupfix_ksize * (indx_t)i);
}
MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val page_dupfix_key(const page_t *mp, size_t i, size_t keysize) {
MDBX_val r;
r.iov_base = page_dupfix_ptr(mp, i, keysize);
r.iov_len = mp->dupfix_ksize;
return r;
}
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b);
#if MDBX_UNALIGNED_OK < 2 || (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG))
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
/* Compare two items pointing at 2-byte aligned unsigned int's. */
cmp_int_align2(const MDBX_val *a, const MDBX_val *b);
#else
#define cmp_int_align2 cmp_int_unaligned
#endif /* !MDBX_UNALIGNED_OK || debug */
#if MDBX_UNALIGNED_OK < 4 || (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG))
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int
/* Compare two items pointing at 4-byte aligned unsigned int's. */
cmp_int_align4(const MDBX_val *a, const MDBX_val *b);
#else
#define cmp_int_align4 cmp_int_unaligned
#endif /* !MDBX_UNALIGNED_OK || debug */
/* Compare two items lexically */
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lexical(const MDBX_val *a, const MDBX_val *b);
/* Compare two items in reverse byte order */
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_reverse(const MDBX_val *a, const MDBX_val *b);
/* Fast non-lexically comparator */
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lenfast(const MDBX_val *a, const MDBX_val *b);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l);
MDBX_NOTHROW_PURE_FUNCTION static inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) {
return unlikely(a->iov_len == b->iov_len) && eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len);
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b);
static inline MDBX_cmp_func *builtin_keycmp(MDBX_db_flags_t flags) {
return (flags & MDBX_REVERSEKEY) ? cmp_reverse : (flags & MDBX_INTEGERKEY) ? cmp_int_align2 : cmp_lexical;
}
static inline MDBX_cmp_func *builtin_datacmp(MDBX_db_flags_t flags) {
return !(flags & MDBX_DUPSORT)
? cmp_lenfast
: ((flags & MDBX_INTEGERDUP) ? cmp_int_unaligned
: ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical));
}
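A brief illustration of which comparators these selectors hand out; the asserts below merely restate the logic above (a sketch, not library code):

#include <assert.h>
static void builtin_cmp_example(void) {
  /* plain table: lexicographic keys, length-then-content comparison for values */
  assert(builtin_keycmp(MDBX_DB_DEFAULTS) == cmp_lexical && builtin_datacmp(MDBX_DB_DEFAULTS) == cmp_lenfast);
  /* integer keys with sorted duplicates: native-endian unsigned keys, lexicographic values */
  assert(builtin_keycmp(MDBX_INTEGERKEY | MDBX_DUPSORT) == cmp_int_align2);
  assert(builtin_datacmp(MDBX_INTEGERKEY | MDBX_DUPSORT) == cmp_lexical);
  /* reverse-ordered duplicates */
  assert(builtin_datacmp(MDBX_DUPSORT | MDBX_REVERSEDUP) == cmp_reverse);
}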
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL uint32_t combine_durability_flags(const uint32_t a, const uint32_t b);
MDBX_CONST_FUNCTION static inline lck_t *lckless_stub(const MDBX_env *env) {
uintptr_t stub = (uintptr_t)&env->lckless_placeholder;
/* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */
stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1);
return (lck_t *)stub;
}
#if !(defined(_WIN32) || defined(_WIN64))
MDBX_CONST_FUNCTION static inline int ignore_enosys(int err) {
#ifdef ENOSYS
if (err == ENOSYS)
return MDBX_RESULT_TRUE;
#endif /* ENOSYS */
#ifdef ENOIMPL
if (err == ENOIMPL)
return MDBX_RESULT_TRUE;
#endif /* ENOIMPL */
#ifdef ENOTSUP
if (err == ENOTSUP)
return MDBX_RESULT_TRUE;
#endif /* ENOTSUP */
#ifdef ENOSUPP
if (err == ENOSUPP)
return MDBX_RESULT_TRUE;
#endif /* ENOSUPP */
#ifdef EOPNOTSUPP
if (err == EOPNOTSUPP)
return MDBX_RESULT_TRUE;
#endif /* EOPNOTSUPP */
return err;
}
MDBX_MAYBE_UNUSED MDBX_CONST_FUNCTION static inline int ignore_enosys_and_eagain(int err) {
return (err == EAGAIN) ? MDBX_RESULT_TRUE : ignore_enosys(err);
}
MDBX_MAYBE_UNUSED MDBX_CONST_FUNCTION static inline int ignore_enosys_and_einval(int err) {
return (err == EINVAL) ? MDBX_RESULT_TRUE : ignore_enosys(err);
}
#endif /* defined(_WIN32) || defined(_WIN64) */
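Typical intended use of these helpers, shown as a hypothetical sketch (not a function from the library): a syscall's "not supported" answer is downgraded to a soft no-op.

#if !(defined(_WIN32) || defined(_WIN64))
#include <fcntl.h>
/* Hypothetical helper: hint the kernel about an upcoming read, but treat a
 * missing/unsupported posix_fadvise() as success. */
static int hint_willneed(int fd, size_t bytes) {
#if defined(POSIX_FADV_WILLNEED)
  int err = ignore_enosys(posix_fadvise(fd, 0, (off_t)bytes, POSIX_FADV_WILLNEED));
  return (err == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : err;
#else
  (void)fd;
  (void)bytes;
  return MDBX_SUCCESS;
#endif /* POSIX_FADV_WILLNEED */
}
#endif /* !Windows */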
static inline int check_env(const MDBX_env *env, const bool wanna_active) {
if (unlikely(!env))
return MDBX_EINVAL;
if (unlikely(env->signature.weak != env_signature))
return MDBX_EBADSIGN;
if (unlikely(env->flags & ENV_FATAL_ERROR))
return MDBX_PANIC;
if (wanna_active) {
#if MDBX_ENV_CHECKPID
if (unlikely(env->pid != osal_getpid()) && env->pid) {
((MDBX_env *)env)->flags |= ENV_FATAL_ERROR;
return MDBX_PANIC;
}
#endif /* MDBX_ENV_CHECKPID */
if (unlikely((env->flags & ENV_ACTIVE) == 0))
return MDBX_EPERM;
eASSERT(env, env->dxb_mmap.base != nullptr);
}
return MDBX_SUCCESS;
}
static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) {
if (unlikely(!txn))
return MDBX_EINVAL;
if (unlikely(txn->signature != txn_signature))
return MDBX_EBADSIGN;
if (bad_bits) {
if (unlikely(!txn->env->dxb_mmap.base))
return MDBX_EPERM;
if (unlikely(txn->flags & bad_bits)) {
if ((bad_bits & MDBX_TXN_RDONLY) && unlikely(txn->flags & MDBX_TXN_RDONLY))
return MDBX_EACCESS;
if ((bad_bits & MDBX_TXN_PARKED) == 0)
return MDBX_BAD_TXN;
return txn_check_badbits_parked(txn, bad_bits);
}
}
tASSERT(txn, (txn->flags & MDBX_TXN_FINISHED) ||
(txn->flags & MDBX_NOSTICKYTHREADS) == (txn->env->flags & MDBX_NOSTICKYTHREADS));
#if MDBX_TXN_CHECKOWNER
if ((txn->flags & (MDBX_NOSTICKYTHREADS | MDBX_TXN_FINISHED)) != MDBX_NOSTICKYTHREADS &&
!(bad_bits /* abort/reset/txn-break */ == 0 &&
((txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)) == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED))) &&
unlikely(txn->owner != osal_thread_self()))
return txn->owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN;
#endif /* MDBX_TXN_CHECKOWNER */
return MDBX_SUCCESS;
}
static inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) {
return check_txn(txn, (bad_bits | MDBX_TXN_RDONLY) & ~MDBX_TXN_PARKED);
}
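The intended call pattern for these validators, as an illustrative sketch of a typical public-API prologue (not an actual entry point):

static int example_write_entry(MDBX_txn *txn) {
  /* reject nullptr, wrong signature, finished/blocked or read-only transactions */
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* ... the actual work would go here ... */
  return MDBX_SUCCESS;
}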
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL void mincore_clean_cache(const MDBX_env *const env);
MDBX_INTERNAL void update_mlcnt(const MDBX_env *env, const pgno_t new_aligned_mlocked_pgno,
const bool lock_not_release);
MDBX_INTERNAL void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, const size_t end_bytes);
MDBX_INTERNAL void munlock_all(const MDBX_env *env);
/*----------------------------------------------------------------------------*/
/* Cache coherence and mmap invalidation */
#ifndef MDBX_CPU_WRITEBACK_INCOHERENT
#error "The MDBX_CPU_WRITEBACK_INCOHERENT must be defined before"
#elif MDBX_CPU_WRITEBACK_INCOHERENT
#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier()
#else
#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier()
#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
MDBX_MAYBE_UNUSED static inline void osal_flush_incoherent_mmap(const void *addr, size_t nbytes,
const intptr_t pagesize) {
#ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE
#error "The MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined before"
#elif MDBX_MMAP_INCOHERENT_FILE_WRITE
char *const begin = (char *)(-pagesize & (intptr_t)addr);
char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1));
int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0;
eASSERT(nullptr, err == 0);
(void)err;
#else
(void)pagesize;
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
#ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE
#error "The MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined before"
#elif MDBX_MMAP_INCOHERENT_CPU_CACHE
#ifdef DCACHE
  /* MIPS has cache coherency issues.
   * Note: for any nbytes >= the on-chip cache size, the entire cache is flushed. */
cacheflush((void *)addr, nbytes, DCACHE);
#else
#error "Oops, cacheflush() not available"
#endif /* DCACHE */
#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE
(void)addr;
(void)nbytes;
#endif
}


@ -1,170 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
/* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */
static bool coherency_check(const MDBX_env *env, const txnid_t txnid, const volatile tree_t *trees,
const volatile meta_t *meta, bool report) {
const txnid_t freedb_mod_txnid = trees[FREE_DBI].mod_txnid;
const txnid_t maindb_mod_txnid = trees[MAIN_DBI].mod_txnid;
const pgno_t last_pgno = meta->geometry.now;
const pgno_t freedb_root_pgno = trees[FREE_DBI].root;
const page_t *freedb_root =
(env->dxb_mmap.base && freedb_root_pgno < last_pgno) ? pgno2page(env, freedb_root_pgno) : nullptr;
const pgno_t maindb_root_pgno = trees[MAIN_DBI].root;
const page_t *maindb_root =
(env->dxb_mmap.base && maindb_root_pgno < last_pgno) ? pgno2page(env, maindb_root_pgno) : nullptr;
const uint64_t magic_and_version = unaligned_peek_u64_volatile(4, &meta->magic_and_version);
bool ok = true;
if (freedb_root_pgno != P_INVALID && unlikely(freedb_root_pgno >= last_pgno)) {
if (report)
WARNING("catch invalid %s-db root %" PRIaPGNO " for meta_txnid %" PRIaTXN " %s", "free", freedb_root_pgno, txnid,
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of unified page/buffer cache)"
: "(wagering meta)");
ok = false;
}
if (maindb_root_pgno != P_INVALID && unlikely(maindb_root_pgno >= last_pgno)) {
if (report)
WARNING("catch invalid %s-db root %" PRIaPGNO " for meta_txnid %" PRIaTXN " %s", "main", maindb_root_pgno, txnid,
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of unified page/buffer cache)"
: "(wagering meta)");
ok = false;
}
if (unlikely(txnid < freedb_mod_txnid ||
(!freedb_mod_txnid && freedb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) {
if (report)
WARNING(
"catch invalid %s-db.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", "free", freedb_mod_txnid, txnid,
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of unified page/buffer cache)" : "(wagering meta)");
ok = false;
}
if (unlikely(txnid < maindb_mod_txnid ||
(!maindb_mod_txnid && maindb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) {
if (report)
WARNING(
"catch invalid %s-db.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", "main", maindb_mod_txnid, txnid,
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of unified page/buffer cache)" : "(wagering meta)");
ok = false;
}
  /* Check the marks inside the root pages only if the pages themselves are
   * within the current mapping. Otherwise a SIGSEGV is possible until the
   * coherency_check_head() call is moved after dxb_resize() inside
   * txn_renew(). */
if (likely(freedb_root && freedb_mod_txnid &&
(size_t)ptr_dist(env->dxb_mmap.base, freedb_root) < env->dxb_mmap.limit)) {
VALGRIND_MAKE_MEM_DEFINED(freedb_root, sizeof(freedb_root->txnid));
MDBX_ASAN_UNPOISON_MEMORY_REGION(freedb_root, sizeof(freedb_root->txnid));
const txnid_t root_txnid = freedb_root->txnid;
if (unlikely(root_txnid != freedb_mod_txnid)) {
if (report)
WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN " for %s-db.mod_txnid %" PRIaTXN " %s",
freedb_root_pgno, root_txnid, "free", freedb_mod_txnid,
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of "
"unified page/buffer cache)"
: "(wagering meta)");
ok = false;
}
}
if (likely(maindb_root && maindb_mod_txnid &&
(size_t)ptr_dist(env->dxb_mmap.base, maindb_root) < env->dxb_mmap.limit)) {
VALGRIND_MAKE_MEM_DEFINED(maindb_root, sizeof(maindb_root->txnid));
MDBX_ASAN_UNPOISON_MEMORY_REGION(maindb_root, sizeof(maindb_root->txnid));
const txnid_t root_txnid = maindb_root->txnid;
if (unlikely(root_txnid != maindb_mod_txnid)) {
if (report)
WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN " for %s-db.mod_txnid %" PRIaTXN " %s",
maindb_root_pgno, root_txnid, "main", maindb_mod_txnid,
(env->stuck_meta < 0) ? "(workaround for incoherent flaw of "
"unified page/buffer cache)"
: "(wagering meta)");
ok = false;
}
}
if (unlikely(!ok) && report)
env->lck->pgops.incoherence.weak =
(env->lck->pgops.incoherence.weak >= INT32_MAX) ? INT32_MAX : env->lck->pgops.incoherence.weak + 1;
return ok;
}
__cold int coherency_timeout(uint64_t *timestamp, intptr_t pgno, const MDBX_env *env) {
if (likely(timestamp && *timestamp == 0))
*timestamp = osal_monotime();
else if (unlikely(!timestamp || osal_monotime() - *timestamp > osal_16dot16_to_monotime(65536 / 10))) {
if (pgno >= 0 && pgno != env->stuck_meta)
ERROR("bailout waiting for %" PRIuSIZE " page arrival %s", pgno,
"(workaround for incoherent flaw of unified page/buffer cache)");
else if (env->stuck_meta < 0)
ERROR("bailout waiting for valid snapshot (%s)", "workaround for incoherent flaw of unified page/buffer cache");
return MDBX_PROBLEM;
}
osal_memory_fence(mo_AcquireRelease, true);
#if defined(_WIN32) || defined(_WIN64)
SwitchToThread();
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
sched_yield();
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
pthread_yield();
#else
usleep(42);
#endif
return MDBX_RESULT_TRUE;
}
/* check with timeout as the workaround
* for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */
__hot int coherency_fetch_head(MDBX_txn *txn, const meta_ptr_t head, uint64_t *timestamp) {
/* Copy the DB info and flags */
txn->txnid = head.txnid;
txn->geo = head.ptr_c->geometry;
memcpy(txn->dbs, &head.ptr_c->trees, sizeof(head.ptr_c->trees));
STATIC_ASSERT(sizeof(head.ptr_c->trees) == CORE_DBS * sizeof(tree_t));
VALGRIND_MAKE_MEM_UNDEFINED(txn->dbs + CORE_DBS, txn->env->max_dbi - CORE_DBS);
txn->canary = head.ptr_c->canary;
if (unlikely(!coherency_check(txn->env, head.txnid, txn->dbs, head.ptr_v, *timestamp == 0) ||
txn->txnid != meta_txnid(head.ptr_v)))
return coherency_timeout(timestamp, -1, txn->env);
if (unlikely(txn->dbs[FREE_DBI].flags != MDBX_INTEGERKEY)) {
if ((txn->dbs[FREE_DBI].flags & DB_PERSISTENT_FLAGS) != MDBX_INTEGERKEY ||
unaligned_peek_u64(4, &head.ptr_c->magic_and_version) == MDBX_DATA_MAGIC) {
ERROR("unexpected/invalid db-flags 0x%x for %s", txn->dbs[FREE_DBI].flags, "GC/FreeDB");
return MDBX_INCOMPATIBLE;
}
txn->dbs[FREE_DBI].flags &= DB_PERSISTENT_FLAGS;
}
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
tASSERT(txn, check_table_flags(txn->dbs[MAIN_DBI].flags));
return MDBX_SUCCESS;
}
int coherency_check_written(const MDBX_env *env, const txnid_t txnid, const volatile meta_t *meta, const intptr_t pgno,
uint64_t *timestamp) {
const bool report = !(timestamp && *timestamp);
const txnid_t head_txnid = meta_txnid(meta);
if (likely(head_txnid >= MIN_TXNID && head_txnid >= txnid)) {
if (likely(coherency_check(env, head_txnid, &meta->trees.gc, meta, report))) {
eASSERT(env, meta->trees.gc.flags == MDBX_INTEGERKEY);
eASSERT(env, check_table_flags(meta->trees.main.flags));
return MDBX_SUCCESS;
}
} else if (report) {
env->lck->pgops.incoherence.weak =
(env->lck->pgops.incoherence.weak >= INT32_MAX) ? INT32_MAX : env->lck->pgops.incoherence.weak + 1;
WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s",
(head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid,
bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base)),
"(workaround for incoherent flaw of unified page/buffer cache)");
}
return coherency_timeout(timestamp, pgno, env);
}
bool coherency_check_meta(const MDBX_env *env, const volatile meta_t *meta, bool report) {
uint64_t timestamp = 0;
return coherency_check_written(env, 0, meta, -1, report ? &timestamp : nullptr) == MDBX_SUCCESS;
}
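How callers are expected to drive these probes, sketched under the assumption that MDBX_RESULT_TRUE means "not coherent yet, back off and retry", as coherency_timeout() above implements (the first call primes *timestamp so the loop can give up after roughly 0.1 s):

static int wait_meta_coherent(const MDBX_env *env, const volatile meta_t *meta) {
  uint64_t timestamp = 0;
  int rc;
  do
    rc = coherency_check_written(env, 0, meta, -1, &timestamp);
  while (rc == MDBX_RESULT_TRUE);
  return rc; /* MDBX_SUCCESS, or MDBX_PROBLEM once the timeout expires */
}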


@ -5,21 +5,17 @@
 /* clang-format off */
 #cmakedefine LTO_ENABLED
-#cmakedefine ENABLE_MEMCHECK
+#cmakedefine MDBX_USE_VALGRIND
 #cmakedefine ENABLE_GPROF
 #cmakedefine ENABLE_GCOV
 #cmakedefine ENABLE_ASAN
-#cmakedefine ENABLE_UBSAN
-#cmakedefine01 MDBX_FORCE_ASSERTIONS
-#if !defined(MDBX_BUILD_TEST) && !defined(MDBX_BUILD_CXX)
-#cmakedefine01 MDBX_BUILD_CXX
-#endif
+#cmakedefine MDBX_FORCE_ASSERTIONS
 /* Common */
 #cmakedefine01 MDBX_TXN_CHECKOWNER
-#cmakedefine MDBX_ENV_CHECKPID_AUTO
-#ifndef MDBX_ENV_CHECKPID_AUTO
-#cmakedefine01 MDBX_ENV_CHECKPID
+#cmakedefine MDBX_TXN_CHECKPID_AUTO
+#ifndef MDBX_TXN_CHECKPID_AUTO
+#cmakedefine01 MDBX_TXN_CHECKPID
 #endif
 #cmakedefine MDBX_LOCKING_AUTO
 #ifndef MDBX_LOCKING_AUTO
@ -29,59 +25,27 @@
 #ifndef MDBX_TRUST_RTC_AUTO
 #cmakedefine01 MDBX_TRUST_RTC
 #endif
-#cmakedefine01 MDBX_DISABLE_VALIDATION
-#cmakedefine01 MDBX_AVOID_MSYNC
-#cmakedefine01 MDBX_ENABLE_REFUND
-#cmakedefine01 MDBX_ENABLE_BIGFOOT
-#cmakedefine01 MDBX_ENABLE_PGOP_STAT
-#cmakedefine01 MDBX_ENABLE_PROFGC
-#cmakedefine01 MDBX_ENABLE_DBI_SPARSE
-#cmakedefine01 MDBX_ENABLE_DBI_LOCKFREE
 /* Windows */
-#if defined(MDBX_BUILD_TEST) || !defined(MDBX_BUILD_CXX) || MDBX_BUILD_CXX
-#define MDBX_WITHOUT_MSVC_CRT 0
-#else
-#cmakedefine01 MDBX_WITHOUT_MSVC_CRT
-#endif /* MDBX_WITHOUT_MSVC_CRT */
+#cmakedefine01 MDBX_CONFIG_MANUAL_TLS_CALLBACK
+#cmakedefine01 MDBX_AVOID_CRT
 /* MacOS & iOS */
-#cmakedefine01 MDBX_APPLE_SPEED_INSTEADOF_DURABILITY
+#cmakedefine01 MDBX_OSX_SPEED_INSTEADOF_DURABILITY
 /* POSIX */
 #cmakedefine01 MDBX_DISABLE_GNU_SOURCE
 #cmakedefine MDBX_USE_OFDLOCKS_AUTO
 #ifndef MDBX_USE_OFDLOCKS_AUTO
 #cmakedefine01 MDBX_USE_OFDLOCKS
-#endif /* MDBX_USE_OFDLOCKS */
-#cmakedefine MDBX_MMAP_NEEDS_JOLT_AUTO
-#ifndef MDBX_MMAP_NEEDS_JOLT_AUTO
-#cmakedefine01 MDBX_MMAP_NEEDS_JOLT
-#endif /* MDBX_MMAP_NEEDS_JOLT */
-#cmakedefine01 MDBX_USE_MINCORE
+#endif
 /* Build Info */
-#ifndef MDBX_BUILD_TIMESTAMP
 #cmakedefine MDBX_BUILD_TIMESTAMP "@MDBX_BUILD_TIMESTAMP@"
-#endif
-#ifndef MDBX_BUILD_TARGET
 #cmakedefine MDBX_BUILD_TARGET "@MDBX_BUILD_TARGET@"
-#endif
-#ifndef MDBX_BUILD_TYPE
 #cmakedefine MDBX_BUILD_TYPE "@MDBX_BUILD_TYPE@"
-#endif
-#ifndef MDBX_BUILD_COMPILER
 #cmakedefine MDBX_BUILD_COMPILER "@MDBX_BUILD_COMPILER@"
-#endif
-#ifndef MDBX_BUILD_FLAGS
 #cmakedefine MDBX_BUILD_FLAGS "@MDBX_BUILD_FLAGS@"
-#endif
-#ifndef MDBX_BUILD_METADATA
-#cmakedefine MDBX_BUILD_METADATA "@MDBX_BUILD_METADATA@"
-#endif
 #cmakedefine MDBX_BUILD_SOURCERY @MDBX_BUILD_SOURCERY@
 /* *INDENT-ON* */

18918
src/core.c Normal file

File diff suppressed because it is too large


@ -1,391 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
/* Cursor states.
 *
 * poor:
 *  - an unpositioned cursor with an unfilled stack;
 *  - must be skipped in all cursor position tracking/adjustment loops;
 *  - only operations that establish an absolute position are allowed;
 *  - in all other cases ENODATA is returned.
 *
 *    Such cursors have top = -1 and flags < 0, which makes it cheap to detect
 *    and skip them in tracking/adjustment loops via the condition
 *    probe_cursor->top < this_cursor->top.
 *
 * hollow:
 *  - a partially initialized cursor, but without a position available to the
 *    user, so no operation can be performed without absolute (rather than
 *    relative) positioning;
 *  - ki[top] may be invalid, including >= page_numkeys(pg[top]).
 *
 *    Such cursors have top >= 0, but flags < 0 (the z_hollow flag is set).
 *
 * pointed:
 *  - a fully initialized cursor with a concrete position holding data;
 *  - the current row can be read or deleted, and relative movement is
 *    possible;
 *  - may carry the z_after_delete, z_eof_hard and z_eof_soft flags;
 *  - the presence of z_eof_soft means the cursor has been moved past the end
 *    of the data, so the current data can neither be read nor deleted.
 *
 *    Such cursors have top >= 0 and flags >= 0 (no z_hollow flag).
 *
 * filled:
 *  - a pointed cursor without the z_eof_soft flag;
 *  - there is data under the cursor, and CRUD operations at the current
 *    position are possible.
 *
 *    Such cursors have top >= 0 and (unsigned)flags < z_eof_soft.
 *
 * State transitions.
 *
 *  - A cursor's state is reset via top_and_flags |= z_poor_mark, which is
 *    equivalent to top = -1 together with flags |= z_poor_mark;
 *  - When positioning a cursor, top is set first, while flags are set only at
 *    the very end, once no error has occurred.
 *  - Repeated first/last positioning may start by setting/zeroing only top
 *    without resetting flags, which lets the fast path inside
 *    tree_search_finalize() do its work.
 *
 *  - Subtleties around the end of data:
 *    - mdbx_cursor_get(NEXT) performs two operations (movement and read), so
 *      moving onto the last row always succeeds and an error is returned only
 *      by the subsequent next(). However, because of this duality, the
 *      semantics of an error returned from mdbx_cursor_get(NEXT) are
 *      ambiguous, since it is unclear what the error refers to:
 *       - If it refers to reading the data, then the cursor has moved and now
 *         stands after the last row. Accordingly, reading at the current
 *         position is forbidden, and prev() brings the cursor back to the
 *         last row;
 *       - If the error refers to the movement, then the cursor has not moved
 *         and stays on the last row. Accordingly, reading at the current
 *         position is allowed, and prev() moves the cursor to the
 *         next-to-last row.
 *       - The catch is that users rely (one way or another) on both kinds of
 *         behavior, while of course expecting mdbx_cursor_eof() to return
 *         true after an MDBX_NEXT error.
 *    - A similar situation arises with MDBX_GET_RANGE, MDBX_LOWERBOUND,
 *      MDBX_GET_BOTH_RANGE and MDBX_UPPERBOUND. Here, on an unsuccessful
 *      search, the cursor may/must stand after the last row.
 *    - Then MDBX_LAST is added. Here the cursor must stand on the last row
 *      and allow reading at the current position, yet mdbx_cursor_eof() must
 *      return true.
 *
 * Solution = introduce two flags, z_eof_soft and z_eof_hard:
 *  - When only z_eof_soft is set, mdbx_cursor_eof() returns true, but reading
 *    data at the current position is allowed, and prev() moves the cursor to
 *    the next-to-last row.
 *  - When z_eof_hard is set, reading data at the current position is not
 *    allowed, mdbx_cursor_eof() also returns true, and prev() positions the
 *    cursor on the last row.
 *    (A public-API sketch of this behavior is given at the end of this header.) */
enum cursor_state {
  /* This is the nested cursor for a nested tree/page and is the inner element
     of struct cursor_couple. */
  z_inner = 0x01,
  /* A GC update is being prepared, so pages may be taken from the GC even for
     FREE_DBI. */
  z_gcu_preparation = 0x02,
  /* The cursor has just been created, so auto-positioning to the beginning/end
     is allowed instead of returning an error. */
  z_fresh = 0x04,
  /* The previous operation was a deletion, so the cursor already physically
     points at the next element and the corresponding movement operation must
     be ignored. */
  z_after_delete = 0x08,
  /* */
  z_disable_tree_search_fastpath = 0x10,
  /* The cursor is logically at the end of the data but physically on the last
   * row, ki[top] == page_numkeys(pg[top]) - 1, and data at the current
   * position may be read. */
  z_eof_soft = 0x20,
  /* The cursor is logically past the end of the data, so the next "backward"
     move must be ignored and/or result in positioning on the last row.
     In the current state no CRUD operations are possible. */
  z_eof_hard = 0x40,
  /* There is no data under the cursor and its position is logically undefined;
     CRUD operations at the current position are not possible.
     Relative movement is forbidden. */
  z_hollow = -128 /* 0x80 */,
  /* Masks for resetting/setting the state. */
  z_clear_mask = z_inner | z_gcu_preparation,
  z_poor_mark = z_eof_hard | z_hollow | z_disable_tree_search_fastpath,
  z_fresh_mark = z_poor_mark | z_fresh
};
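A hypothetical debug helper (not part of the original header) that names the state a cursor is in, restating the rules above with the same comparisons used by the predicates below:

MDBX_MAYBE_UNUSED static inline const char *cursor_state_name(const MDBX_cursor *mc) {
  if (mc->top < 0)
    return "poor"; /* unpositioned, skipped by tracking/adjustment loops */
  if (mc->flags < 0)
    return "hollow"; /* has a stack, but no user-visible data position */
  if ((uint8_t)mc->flags >= z_eof_soft)
    return "pointed/eof"; /* logically at or past the end of the data */
  return "filled"; /* the current row can be read, updated or deleted */
}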
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_inner(const MDBX_cursor *mc) {
return (mc->flags & z_inner) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_poor(const MDBX_cursor *mc) {
const bool r = mc->top < 0;
cASSERT(mc, r == (mc->top_and_flags < 0));
if (r && mc->subcur)
cASSERT(mc, mc->subcur->cursor.flags < 0 && mc->subcur->cursor.top < 0);
return r;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_pointed(const MDBX_cursor *mc) {
const bool r = mc->top >= 0;
cASSERT(mc, r == (mc->top_and_flags >= 0));
if (!r && mc->subcur)
cASSERT(mc, is_poor(&mc->subcur->cursor));
return r;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_hollow(const MDBX_cursor *mc) {
const bool r = mc->flags < 0;
if (!r) {
cASSERT(mc, mc->top >= 0);
cASSERT(mc, (mc->flags & z_eof_hard) || mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
} else if (mc->subcur)
cASSERT(mc, is_poor(&mc->subcur->cursor) || (is_pointed(mc) && mc->subcur->cursor.flags < 0));
return r;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_eof(const MDBX_cursor *mc) {
const bool r = z_eof_soft <= (uint8_t)mc->flags;
return r;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_filled(const MDBX_cursor *mc) {
const bool r = z_eof_hard > (uint8_t)mc->flags;
return r;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool inner_filled(const MDBX_cursor *mc) {
return mc->subcur && is_filled(&mc->subcur->cursor);
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool inner_pointed(const MDBX_cursor *mc) {
return mc->subcur && is_pointed(&mc->subcur->cursor);
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool inner_hollow(const MDBX_cursor *mc) {
const bool r = !mc->subcur || is_hollow(&mc->subcur->cursor);
#if MDBX_DEBUG || MDBX_FORCE_ASSERTIONS
if (!r) {
cASSERT(mc, is_filled(mc));
const page_t *mp = mc->pg[mc->top];
const node_t *node = page_node(mp, mc->ki[mc->top]);
cASSERT(mc, node_flags(node) & N_DUP);
}
#endif /* MDBX_DEBUG || MDBX_FORCE_ASSERTIONS */
return r;
}
MDBX_MAYBE_UNUSED static inline void inner_gone(MDBX_cursor *mc) {
if (mc->subcur) {
TRACE("reset inner cursor %p", __Wpedantic_format_voidptr(&mc->subcur->cursor));
mc->subcur->nested_tree.root = 0;
mc->subcur->cursor.top_and_flags = z_inner | z_poor_mark;
}
}
MDBX_MAYBE_UNUSED static inline void be_poor(MDBX_cursor *mc) {
const bool inner = is_inner(mc);
if (inner) {
mc->tree->root = 0;
mc->top_and_flags = z_inner | z_poor_mark;
} else {
mc->top_and_flags |= z_poor_mark;
inner_gone(mc);
}
cASSERT(mc, is_poor(mc) && !is_pointed(mc) && !is_filled(mc));
cASSERT(mc, inner == is_inner(mc));
}
MDBX_MAYBE_UNUSED static inline void be_filled(MDBX_cursor *mc) {
cASSERT(mc, mc->top >= 0);
cASSERT(mc, mc->ki[mc->top] < page_numkeys(mc->pg[mc->top]));
const bool inner = is_inner(mc);
mc->flags &= z_clear_mask;
cASSERT(mc, is_filled(mc));
cASSERT(mc, inner == is_inner(mc));
}
MDBX_MAYBE_UNUSED static inline bool is_related(const MDBX_cursor *base, const MDBX_cursor *scan) {
cASSERT(base, base->top >= 0);
return base->top <= scan->top && base != scan;
}
/* Cursor checking/validation flags. */
enum cursor_checking {
z_branch = 0x01 /* same as P_BRANCH for check_leaf_type() */,
z_leaf = 0x02 /* same as P_LEAF for check_leaf_type() */,
z_largepage = 0x04 /* same as P_LARGE for check_leaf_type() */,
z_updating = 0x08 /* update/rebalance pending */,
z_ignord = 0x10 /* don't check keys ordering */,
z_dupfix = 0x20 /* same as P_DUPFIX for check_leaf_type() */,
z_retiring = 0x40 /* refs to child pages may be invalid */,
z_pagecheck = 0x80 /* perform page checking, see MDBX_VALIDATION */
};
MDBX_INTERNAL int __must_check_result cursor_validate(const MDBX_cursor *mc);
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline size_t cursor_dbi(const MDBX_cursor *mc) {
cASSERT(mc, mc->txn && mc->txn->signature == txn_signature);
size_t dbi = mc->dbi_state - mc->txn->dbi_state;
cASSERT(mc, dbi < mc->txn->env->n_dbi);
return dbi;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool cursor_dbi_changed(const MDBX_cursor *mc) {
return dbi_changed(mc->txn, cursor_dbi(mc));
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t *cursor_dbi_state(const MDBX_cursor *mc) {
return mc->dbi_state;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool cursor_is_gc(const MDBX_cursor *mc) {
return mc->dbi_state == mc->txn->dbi_state + FREE_DBI;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool cursor_is_main(const MDBX_cursor *mc) {
return mc->dbi_state == mc->txn->dbi_state + MAIN_DBI;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool cursor_is_core(const MDBX_cursor *mc) {
return mc->dbi_state < mc->txn->dbi_state + CORE_DBS;
}
MDBX_MAYBE_UNUSED static inline int cursor_dbi_dbg(const MDBX_cursor *mc) {
/* Debugging output value of a cursor's DBI: Negative for a sub-cursor. */
const int dbi = cursor_dbi(mc);
return (mc->flags & z_inner) ? -dbi : dbi;
}
MDBX_MAYBE_UNUSED static inline int __must_check_result cursor_push(MDBX_cursor *mc, page_t *mp, indx_t ki) {
TRACE("pushing page %" PRIaPGNO " on db %d cursor %p", mp->pgno, cursor_dbi_dbg(mc), __Wpedantic_format_voidptr(mc));
if (unlikely(mc->top >= CURSOR_STACK_SIZE - 1)) {
be_poor(mc);
mc->txn->flags |= MDBX_TXN_ERROR;
return MDBX_CURSOR_FULL;
}
mc->top += 1;
mc->pg[mc->top] = mp;
mc->ki[mc->top] = ki;
return MDBX_SUCCESS;
}
MDBX_MAYBE_UNUSED static inline void cursor_pop(MDBX_cursor *mc) {
TRACE("popped page %" PRIaPGNO " off db %d cursor %p", mc->pg[mc->top]->pgno, cursor_dbi_dbg(mc),
__Wpedantic_format_voidptr(mc));
cASSERT(mc, mc->top >= 0);
mc->top -= 1;
}
MDBX_NOTHROW_PURE_FUNCTION static inline bool check_leaf_type(const MDBX_cursor *mc, const page_t *mp) {
return (((page_type(mp) ^ mc->checking) & (z_branch | z_leaf | z_largepage | z_dupfix)) == 0);
}
MDBX_INTERNAL int cursor_check(const MDBX_cursor *mc, int txn_bad_bits);
/* no data access needed, parked transactions are not woken up. */
static inline int cursor_check_pure(const MDBX_cursor *mc) {
  return cursor_check(mc, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED);
}
/* for reading data, waking up parked transactions. */
static inline int cursor_check_ro(const MDBX_cursor *mc) { return cursor_check(mc, MDBX_TXN_BLOCKED); }
/* for writing data. */
static inline int cursor_check_rw(const MDBX_cursor *mc) {
return cursor_check(mc, (MDBX_TXN_BLOCKED - MDBX_TXN_PARKED) | MDBX_TXN_RDONLY);
}
MDBX_INTERNAL MDBX_cursor *cursor_eot(MDBX_cursor *cursor, MDBX_txn *txn);
MDBX_INTERNAL int cursor_shadow(MDBX_cursor *cursor, MDBX_txn *nested, const size_t dbi);
MDBX_INTERNAL MDBX_cursor *cursor_cpstk(const MDBX_cursor *csrc, MDBX_cursor *cdst);
MDBX_INTERNAL int __must_check_result cursor_ops(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
const MDBX_cursor_op op);
MDBX_INTERNAL int __must_check_result cursor_check_multiple(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
unsigned flags);
MDBX_INTERNAL int __must_check_result cursor_put_checklen(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
unsigned flags);
MDBX_INTERNAL int __must_check_result cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, unsigned flags);
MDBX_INTERNAL int __must_check_result cursor_validate_updating(MDBX_cursor *mc);
MDBX_INTERNAL int __must_check_result cursor_del(MDBX_cursor *mc, unsigned flags);
MDBX_INTERNAL int __must_check_result cursor_sibling_left(MDBX_cursor *mc);
MDBX_INTERNAL int __must_check_result cursor_sibling_right(MDBX_cursor *mc);
typedef struct cursor_set_result {
int err;
bool exact;
} csr_t;
MDBX_INTERNAL csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op);
MDBX_INTERNAL int __must_check_result inner_first(MDBX_cursor *__restrict mc, MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result inner_last(MDBX_cursor *__restrict mc, MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result outer_first(MDBX_cursor *__restrict mc, MDBX_val *__restrict key,
MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result outer_last(MDBX_cursor *__restrict mc, MDBX_val *__restrict key,
MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result inner_next(MDBX_cursor *__restrict mc, MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result inner_prev(MDBX_cursor *__restrict mc, MDBX_val *__restrict data);
MDBX_INTERNAL int __must_check_result outer_next(MDBX_cursor *__restrict mc, MDBX_val *__restrict key,
MDBX_val *__restrict data, MDBX_cursor_op op);
MDBX_INTERNAL int __must_check_result outer_prev(MDBX_cursor *__restrict mc, MDBX_val *__restrict key,
MDBX_val *__restrict data, MDBX_cursor_op op);
MDBX_INTERNAL int cursor_init4walk(cursor_couple_t *couple, const MDBX_txn *const txn, tree_t *const tree,
kvx_t *const kvx);
MDBX_INTERNAL int __must_check_result cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, size_t dbi);
MDBX_INTERNAL int __must_check_result cursor_dupsort_setup(MDBX_cursor *mc, const node_t *node, const page_t *mp);
MDBX_INTERNAL int __must_check_result cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, const MDBX_val *data);
/*----------------------------------------------------------------------------*/
/* Update sub-page pointer, if any, in mc->subcur.
* Needed when the node which contains the sub-page may have moved.
* Called with mp = mc->pg[mc->top], ki = mc->ki[mc->top]. */
MDBX_MAYBE_UNUSED static inline void cursor_inner_refresh(const MDBX_cursor *mc, const page_t *mp, unsigned ki) {
cASSERT(mc, is_leaf(mp));
const node_t *node = page_node(mp, ki);
if ((node_flags(node) & (N_DUP | N_TREE)) == N_DUP)
mc->subcur->cursor.pg[0] = node_data(node);
}
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool cursor_is_tracked(const MDBX_cursor *mc);
static inline void cursor_reset(cursor_couple_t *couple) {
couple->outer.top_and_flags = z_fresh_mark;
couple->inner.cursor.top_and_flags = z_fresh_mark | z_inner;
}
static inline void cursor_drown(cursor_couple_t *couple) {
couple->outer.top_and_flags = z_poor_mark;
couple->inner.cursor.top_and_flags = z_poor_mark | z_inner;
couple->outer.txn = nullptr;
couple->inner.cursor.txn = nullptr;
couple->outer.tree = nullptr;
  /* keep the clc pointer, since it is used to compute the dbi in mdbx_cursor_renew(). */
couple->outer.dbi_state = nullptr;
couple->inner.cursor.dbi_state = nullptr;
}
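A sketch of the user-visible consequence of z_eof_soft/z_eof_hard described at the top of this header, using only the public API and assuming the cursor is positioned inside a non-empty table:

#include "mdbx.h"
#include <assert.h>

static void eof_semantics_example(MDBX_cursor *cursor) {
  MDBX_val key, data;
  int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST);
  assert(rc == MDBX_SUCCESS);
  /* stepping past the last row fails ... */
  rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT);
  assert(rc == MDBX_NOTFOUND);
  /* ... after which the cursor reports end-of-data ... */
  assert(mdbx_cursor_eof(cursor) == MDBX_RESULT_TRUE);
  /* ... while MDBX_PREV brings it back onto the last row. */
  rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV);
  assert(rc == MDBX_SUCCESS);
  (void)rc;
}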

692
src/dbi.c

@ -1,692 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
#if MDBX_ENABLE_DBI_SPARSE
size_t dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi) {
tASSERT(txn, bmi > 0);
bmi &= -bmi;
if (sizeof(txn->dbi_sparse[0]) > 4) {
static const uint8_t debruijn_ctz64[64] = {0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12};
return debruijn_ctz64[(UINT64_C(0x022FDD63CC95386D) * (uint64_t)bmi) >> 58];
} else {
static const uint8_t debruijn_ctz32[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return debruijn_ctz32[(UINT32_C(0x077CB531) * (uint32_t)bmi) >> 27];
}
}
#endif /* MDBX_ENABLE_DBI_SPARSE */
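For reference, the same De Bruijn minimal-perfect-hash trick in a self-contained 32-bit form (an illustrative sketch reusing the constant and table from the fallback above):

static unsigned ctz32_debruijn_example(uint32_t x /* must be non-zero */) {
  static const uint8_t table[32] = {0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
                                    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
  /* x & -x isolates the lowest set bit; the multiplication pushes a unique
   * 5-bit pattern into the top bits, which the table maps back to the bit index. */
  return table[((x & (0u - x)) * UINT32_C(0x077CB531)) >> 27];
}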
struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) {
eASSERT(env, dbi < env->n_dbi);
struct dbi_snap_result r;
uint32_t snap = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
do {
r.sequence = snap;
r.flags = env->dbs_flags[dbi];
snap = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
} while (unlikely(snap != r.sequence));
return r;
}
__noinline int dbi_import(MDBX_txn *txn, const size_t dbi) {
const MDBX_env *const env = txn->env;
if (dbi >= env->n_dbi || !env->dbs_flags[dbi])
return MDBX_BAD_DBI;
#if MDBX_ENABLE_DBI_SPARSE
const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
const size_t bitmap_indx = dbi / bitmap_chunk;
const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk;
if (dbi >= txn->n_dbi) {
for (size_t i = (txn->n_dbi + bitmap_chunk - 1) / bitmap_chunk; bitmap_indx >= i; ++i)
txn->dbi_sparse[i] = 0;
eASSERT(env, (txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0);
MDBX_txn *scan = txn;
do {
eASSERT(env, scan->dbi_sparse == txn->dbi_sparse);
eASSERT(env, scan->n_dbi < dbi + 1);
scan->n_dbi = (unsigned)dbi + 1;
scan->dbi_state[dbi] = 0;
scan = scan->parent;
} while (scan /* && scan->dbi_sparse == txn->dbi_sparse */);
txn->dbi_sparse[bitmap_indx] |= bitmap_mask;
goto lindo;
}
if ((txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0) {
MDBX_txn *scan = txn;
do {
eASSERT(env, scan->dbi_sparse == txn->dbi_sparse);
eASSERT(env, scan->n_dbi == txn->n_dbi);
scan->dbi_state[dbi] = 0;
scan = scan->parent;
} while (scan /* && scan->dbi_sparse == txn->dbi_sparse */);
txn->dbi_sparse[bitmap_indx] |= bitmap_mask;
goto lindo;
}
#else
if (dbi >= txn->n_dbi) {
size_t i = txn->n_dbi;
do
txn->dbi_state[i] = 0;
while (dbi >= ++i);
txn->n_dbi = i;
goto lindo;
}
#endif /* MDBX_ENABLE_DBI_SPARSE */
if (!txn->dbi_state[dbi]) {
lindo:
    /* the dbi slot has not yet been initialized in this transaction, and the handle has not been used */
    txn->cursors[dbi] = nullptr;
    MDBX_txn *const parent = txn->parent;
    if (parent) {
      /* nested write transaction */
      int rc = dbi_check(parent, dbi);
      /* copy the dbi-handle state, clearing the "new" flags. */
eASSERT(env, txn->dbi_seqs == parent->dbi_seqs);
txn->dbi_state[dbi] = parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
if (likely(rc == MDBX_SUCCESS)) {
txn->dbs[dbi] = parent->dbs[dbi];
rc = txn_shadow_cursors(parent, dbi);
}
return rc;
}
txn->dbi_seqs[dbi] = 0;
txn->dbi_state[dbi] = DBI_LINDO;
} else {
eASSERT(env, txn->dbi_seqs[dbi] != env->dbi_seqs[dbi].weak);
if (unlikely((txn->dbi_state[dbi] & (DBI_VALID | DBI_OLDEN)) || txn->cursors[dbi])) {
      /* the handle has already been used in this transaction but was closed or
       * reopened, or there are dangling cursors on an explicit re-open of the handle */
eASSERT(env, (txn->dbi_state[dbi] & DBI_STALE) == 0);
txn->dbi_seqs[dbi] = env->dbi_seqs[dbi].weak;
txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO;
return txn->cursors[dbi] ? MDBX_DANGLING_DBI : MDBX_BAD_DBI;
}
}
  /* the handle has not been used in this transaction, or is being explicitly
   * re-opened and there are no dangling cursors */
  eASSERT(env, (txn->dbi_state[dbi] & DBI_LINDO) && !txn->cursors[dbi]);
  /* read the current flags and sequence */
struct dbi_snap_result snap = dbi_snap(env, dbi);
txn->dbi_seqs[dbi] = snap.sequence;
if (snap.flags & DB_VALID) {
txn->dbs[dbi].flags = snap.flags & DB_PERSISTENT_FLAGS;
txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_STALE;
return MDBX_SUCCESS;
}
return MDBX_BAD_DBI;
}
int dbi_defer_release(MDBX_env *const env, defer_free_item_t *const chain) {
size_t length = 0;
defer_free_item_t *obsolete_chain = nullptr;
#if MDBX_ENABLE_DBI_LOCKFREE
const uint64_t now = osal_monotime();
defer_free_item_t **scan = &env->defer_free;
if (env->defer_free) {
const uint64_t threshold_1second = osal_16dot16_to_monotime(1 * 65536);
do {
defer_free_item_t *item = *scan;
if (now - item->timestamp < threshold_1second) {
scan = &item->next;
length += 1;
} else {
*scan = item->next;
item->next = obsolete_chain;
obsolete_chain = item;
}
} while (*scan);
}
eASSERT(env, *scan == nullptr);
if (chain) {
defer_free_item_t *item = chain;
do {
item->timestamp = now;
item = item->next;
} while (item);
*scan = chain;
}
#else /* MDBX_ENABLE_DBI_LOCKFREE */
obsolete_chain = chain;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
ENSURE(env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS);
if (length > 42) {
#if defined(_WIN32) || defined(_WIN64)
SwitchToThread();
#else
sched_yield();
#endif /* Windows */
}
while (obsolete_chain) {
defer_free_item_t *item = obsolete_chain;
obsolete_chain = obsolete_chain->next;
osal_free(item);
}
return chain ? MDBX_SUCCESS : MDBX_BAD_DBI;
}
/* Export or close DBI handles opened in this txn. */
int dbi_update(MDBX_txn *txn, bool keep) {
MDBX_env *const env = txn->env;
tASSERT(txn, !txn->parent && txn == env->basal_txn);
bool locked = false;
defer_free_item_t *defer_chain = nullptr;
TXN_FOREACH_DBI_USER(txn, dbi) {
if (likely((txn->dbi_state[dbi] & DBI_CREAT) == 0))
continue;
if (!locked) {
int err = osal_fastmutex_acquire(&env->dbi_lock);
if (unlikely(err != MDBX_SUCCESS))
return err;
locked = true;
if (dbi >= env->n_dbi)
        /* the handle was closed by another thread while the lock was being acquired */
continue;
}
tASSERT(txn, dbi < env->n_dbi);
if (keep) {
env->dbs_flags[dbi] = txn->dbs[dbi].flags | DB_VALID;
} else {
uint32_t seq = dbi_seq_next(env, dbi);
defer_free_item_t *item = env->kvs[dbi].name.iov_base;
if (item) {
env->dbs_flags[dbi] = 0;
env->kvs[dbi].name.iov_len = 0;
env->kvs[dbi].name.iov_base = nullptr;
atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
osal_flush_incoherent_cpu_writeback();
item->next = defer_chain;
defer_chain = item;
} else {
eASSERT(env, env->kvs[dbi].name.iov_len == 0);
eASSERT(env, env->dbs_flags[dbi] == 0);
}
}
}
if (locked) {
size_t i = env->n_dbi;
eASSERT(env, env->n_dbi >= CORE_DBS);
while ((env->dbs_flags[i - 1] & DB_VALID) == 0) {
--i;
eASSERT(env, i >= CORE_DBS);
eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len && !env->kvs[i].name.iov_base);
}
env->n_dbi = (unsigned)i;
dbi_defer_release(env, defer_chain);
}
return MDBX_SUCCESS;
}
int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
const MDBX_env *const env = txn->env;
eASSERT(env, dbi < txn->n_dbi && dbi < env->n_dbi);
eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO);
eASSERT(env, env->dbs_flags[dbi] != DB_POISON);
if ((env->dbs_flags[dbi] & DB_VALID) == 0) {
eASSERT(env, !env->kvs[dbi].clc.k.cmp && !env->kvs[dbi].clc.v.cmp && !env->kvs[dbi].name.iov_len &&
!env->kvs[dbi].name.iov_base && !env->kvs[dbi].clc.k.lmax && !env->kvs[dbi].clc.k.lmin &&
!env->kvs[dbi].clc.v.lmax && !env->kvs[dbi].clc.v.lmin);
} else {
eASSERT(env, !(txn->dbi_state[dbi] & DBI_VALID) || (txn->dbs[dbi].flags | DB_VALID) == env->dbs_flags[dbi]);
eASSERT(env, env->kvs[dbi].name.iov_base || dbi < CORE_DBS);
}
  /* If the dbi has already been used, four combinations are considered valid:
   * 1) user_flags equal MDBX_DB_ACCEDE
   *   = assume the user is opening an existing table; the validation code
   *     will not allow different comparators to be installed.
   * 2) user_flags are zero and both comparators are null or equal to the current ones
   *   = assume the user is opening an existing table the old way,
   *     with zero/default flags.
   * 3) user_flags match and the comparators are not given or are the same
   *   = assume the user is opening the table specifying all parameters;
   * 4) user_flags differ, but the table is empty and MDBX_CREATE is set
   *   = assume the user is re-creating the table.
   * (See the mdbx_dbi_open() sketch following this function.)
   */
if ((user_flags & ~MDBX_CREATE) != (unsigned)(env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS)) {
    /* flags differ, check the other conditions */
if ((!user_flags && (!keycmp || keycmp == env->kvs[dbi].clc.k.cmp) &&
(!datacmp || datacmp == env->kvs[dbi].clc.v.cmp)) ||
user_flags == MDBX_DB_ACCEDE) {
user_flags = env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS;
} else if ((user_flags & MDBX_CREATE) == 0)
return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
else {
if (txn->dbi_state[dbi] & DBI_STALE) {
eASSERT(env, env->dbs_flags[dbi] & DB_VALID);
int err = tbl_fetch(txn, dbi);
        if (unlikely(err != MDBX_SUCCESS))
return err;
}
eASSERT(env, ((env->dbs_flags[dbi] ^ txn->dbs[dbi].flags) & DB_PERSISTENT_FLAGS) == 0);
eASSERT(env, (txn->dbi_state[dbi] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == (DBI_LINDO | DBI_VALID));
if (unlikely(txn->dbs[dbi].leaf_pages))
return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
      /* Re-create the table if it is empty */
if (unlikely(txn->cursors[dbi]))
return MDBX_DANGLING_DBI;
env->dbs_flags[dbi] = DB_POISON;
atomic_store32(&env->dbi_seqs[dbi], dbi_seq_next(env, dbi), mo_AcquireRelease);
const uint32_t seq = dbi_seq_next(env, dbi);
const uint16_t db_flags = user_flags & DB_PERSISTENT_FLAGS;
eASSERT(env, txn->dbs[dbi].height == 0 && txn->dbs[dbi].items == 0 && txn->dbs[dbi].root == P_INVALID);
env->kvs[dbi].clc.k.cmp = keycmp ? keycmp : builtin_keycmp(user_flags);
env->kvs[dbi].clc.v.cmp = datacmp ? datacmp : builtin_datacmp(user_flags);
txn->dbs[dbi].flags = db_flags;
txn->dbs[dbi].dupfix_size = 0;
if (unlikely(tbl_setup(env, &env->kvs[dbi], &txn->dbs[dbi]))) {
txn->dbi_state[dbi] = DBI_LINDO;
txn->flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM;
}
env->dbs_flags[dbi] = db_flags | DB_VALID;
atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
txn->dbi_seqs[dbi] = seq;
txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_CREAT | DBI_DIRTY;
txn->flags |= MDBX_TXN_DIRTY;
}
}
if (!keycmp)
keycmp = (env->dbs_flags[dbi] & DB_VALID) ? env->kvs[dbi].clc.k.cmp : builtin_keycmp(user_flags);
if (env->kvs[dbi].clc.k.cmp != keycmp) {
if (env->dbs_flags[dbi] & DB_VALID)
return MDBX_EINVAL;
env->kvs[dbi].clc.k.cmp = keycmp;
}
if (!datacmp)
datacmp = (env->dbs_flags[dbi] & DB_VALID) ? env->kvs[dbi].clc.v.cmp : builtin_datacmp(user_flags);
if (env->kvs[dbi].clc.v.cmp != datacmp) {
if (env->dbs_flags[dbi] & DB_VALID)
return MDBX_EINVAL;
env->kvs[dbi].clc.v.cmp = datacmp;
}
return MDBX_SUCCESS;
}
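The four accepted combinations listed in the comment above surface through the public mdbx_dbi_open(); a hedged sketch, assuming a named table "table" that already exists:

#include "mdbx.h"

static int reopen_examples(MDBX_txn *txn, MDBX_dbi *dbi) {
  /* 1) accede to whatever flags the table already has */
  int rc = mdbx_dbi_open(txn, "table", MDBX_DB_ACCEDE, dbi);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* 2/3) open with defaults, or with exactly the flags it was created with */
  rc = mdbx_dbi_open(txn, "table", MDBX_DB_DEFAULTS, dbi);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* 4) re-create with different flags: only allowed when the table is empty
   *    and MDBX_CREATE is given, otherwise MDBX_INCOMPATIBLE is returned */
  return mdbx_dbi_open(txn, "table", MDBX_CREATE | MDBX_DUPSORT, dbi);
}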
static inline size_t dbi_namelen(const MDBX_val name) {
return (name.iov_len > sizeof(defer_free_item_t)) ? name.iov_len : sizeof(defer_free_item_t);
}
static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
MDBX_cmp_func *datacmp, MDBX_val name) {
MDBX_env *const env = txn->env;
/* Cannot mix named table(s) with DUPSORT flags */
tASSERT(txn, (txn->dbi_state[MAIN_DBI] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == (DBI_LINDO | DBI_VALID));
if (unlikely(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT)) {
if (unlikely((user_flags & MDBX_CREATE) == 0))
return MDBX_NOTFOUND;
if (unlikely(txn->dbs[MAIN_DBI].leaf_pages))
      /* MainDB contains records or has already been used. */
      return MDBX_INCOMPATIBLE;
    /* Re-create MainDB when it is empty. */
tASSERT(txn,
txn->dbs[MAIN_DBI].height == 0 && txn->dbs[MAIN_DBI].items == 0 && txn->dbs[MAIN_DBI].root == P_INVALID);
if (unlikely(txn->cursors[MAIN_DBI]))
return MDBX_DANGLING_DBI;
env->dbs_flags[MAIN_DBI] = DB_POISON;
atomic_store32(&env->dbi_seqs[MAIN_DBI], dbi_seq_next(env, MAIN_DBI), mo_AcquireRelease);
const uint32_t seq = dbi_seq_next(env, MAIN_DBI);
const uint16_t main_flags = txn->dbs[MAIN_DBI].flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY);
env->kvs[MAIN_DBI].clc.k.cmp = builtin_keycmp(main_flags);
env->kvs[MAIN_DBI].clc.v.cmp = builtin_datacmp(main_flags);
txn->dbs[MAIN_DBI].flags = main_flags;
txn->dbs[MAIN_DBI].dupfix_size = 0;
int err = tbl_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]);
if (unlikely(err != MDBX_SUCCESS)) {
txn->dbi_state[MAIN_DBI] = DBI_LINDO;
txn->flags |= MDBX_TXN_ERROR;
env->flags |= ENV_FATAL_ERROR;
return err;
}
env->dbs_flags[MAIN_DBI] = main_flags | DB_VALID;
txn->dbi_seqs[MAIN_DBI] = atomic_store32(&env->dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease);
txn->dbi_state[MAIN_DBI] |= DBI_DIRTY;
txn->flags |= MDBX_TXN_DIRTY;
}
tASSERT(txn, env->kvs[MAIN_DBI].clc.k.cmp);
/* Is the DB already open? */
size_t slot = env->n_dbi;
for (size_t scan = CORE_DBS; scan < env->n_dbi; ++scan) {
if ((env->dbs_flags[scan] & DB_VALID) == 0) {
/* Remember this free slot */
slot = (slot < scan) ? slot : scan;
continue;
}
if (!env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[scan].name)) {
slot = scan;
int err = dbi_check(txn, slot);
if (err == MDBX_BAD_DBI && txn->dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) {
        /* the handle was used and became invalid, but is now being
         * explicitly re-opened in this transaction */
eASSERT(env, !txn->cursors[slot]);
txn->dbi_state[slot] = DBI_LINDO;
err = dbi_check(txn, slot);
}
if (err == MDBX_SUCCESS) {
err = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
if (likely(err == MDBX_SUCCESS)) {
goto done;
}
}
return err;
}
}
/* Fail, if no free slot and max hit */
if (unlikely(slot >= env->max_dbi))
return MDBX_DBS_FULL;
if (env->n_dbi == slot)
eASSERT(env, !env->dbs_flags[slot] && !env->kvs[slot].name.iov_len && !env->kvs[slot].name.iov_base);
env->dbs_flags[slot] = DB_POISON;
atomic_store32(&env->dbi_seqs[slot], dbi_seq_next(env, slot), mo_AcquireRelease);
memset(&env->kvs[slot], 0, sizeof(env->kvs[slot]));
if (env->n_dbi == slot)
env->n_dbi = (unsigned)slot + 1;
eASSERT(env, slot < env->n_dbi);
int err = dbi_check(txn, slot);
eASSERT(env, err == MDBX_BAD_DBI);
if (err != MDBX_BAD_DBI)
return MDBX_PROBLEM;
/* Find the DB info */
MDBX_val body;
cursor_couple_t cx;
int rc = cursor_init(&cx.outer, txn, MAIN_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
rc = cursor_seek(&cx.outer, &name, &body, MDBX_SET).err;
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE))
return rc;
} else {
/* make sure this is actually a table */
node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]);
if (unlikely((node_flags(node) & (N_DUP | N_TREE)) != N_TREE))
return MDBX_INCOMPATIBLE;
if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(tree_t))) {
ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid table node size", body.iov_len);
return MDBX_CORRUPTED;
}
memcpy(&txn->dbs[slot], body.iov_base, sizeof(tree_t));
}
/* Done here so we cannot fail after creating a new DB */
defer_free_item_t *const clone = osal_malloc(dbi_namelen(name));
if (unlikely(!clone))
return MDBX_ENOMEM;
memcpy(clone, name.iov_base, name.iov_len);
name.iov_base = clone;
uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH;
if (unlikely(rc)) {
/* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */
tASSERT(txn, rc == MDBX_NOTFOUND);
body.iov_base = memset(&txn->dbs[slot], 0, body.iov_len = sizeof(tree_t));
txn->dbs[slot].root = P_INVALID;
txn->dbs[slot].mod_txnid = txn->txnid;
txn->dbs[slot].flags = user_flags & DB_PERSISTENT_FLAGS;
cx.outer.next = txn->cursors[MAIN_DBI];
txn->cursors[MAIN_DBI] = &cx.outer;
rc = cursor_put_checklen(&cx.outer, &name, &body, N_TREE | MDBX_NOOVERWRITE);
txn->cursors[MAIN_DBI] = cx.outer.next;
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
dbi_state |= DBI_DIRTY | DBI_CREAT;
txn->flags |= MDBX_TXN_DIRTY;
tASSERT(txn, (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) != 0);
}
/* Got info, register DBI in this txn */
const uint32_t seq = dbi_seq_next(env, slot);
eASSERT(env, env->dbs_flags[slot] == DB_POISON && !txn->cursors[slot] &&
(txn->dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO);
txn->dbi_state[slot] = dbi_state;
memcpy(&txn->dbs[slot], body.iov_base, sizeof(txn->dbs[slot]));
env->dbs_flags[slot] = txn->dbs[slot].flags;
rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
env->kvs[slot].name = name;
env->dbs_flags[slot] = txn->dbs[slot].flags | DB_VALID;
txn->dbi_seqs[slot] = atomic_store32(&env->dbi_seqs[slot], seq, mo_AcquireRelease);
done:
*dbi = (MDBX_dbi)slot;
tASSERT(txn, slot < txn->n_dbi && (env->dbs_flags[slot] & DB_VALID) != 0);
eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS);
return MDBX_SUCCESS;
bailout:
eASSERT(env, !txn->cursors[slot] && !env->kvs[slot].name.iov_len && !env->kvs[slot].name.iov_base);
txn->dbi_state[slot] &= DBI_LINDO | DBI_OLDEN;
env->dbs_flags[slot] = 0;
osal_free(clone);
if (slot + 1 == env->n_dbi)
txn->n_dbi = env->n_dbi = (unsigned)slot;
return rc;
}
int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
MDBX_cmp_func *datacmp) {
if (unlikely(!dbi))
return MDBX_EINVAL;
*dbi = 0;
if (user_flags != MDBX_ACCEDE && unlikely(!check_table_flags(user_flags & ~MDBX_CREATE)))
return MDBX_EINVAL;
int rc = check_txn(txn, MDBX_TXN_BLOCKED);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if ((user_flags & MDBX_CREATE) && unlikely(txn->flags & MDBX_TXN_RDONLY))
return MDBX_EACCESS;
/* main table? */
if (unlikely(name == MDBX_CHK_MAIN || name->iov_base == MDBX_CHK_MAIN)) {
rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp);
if (likely(rc == MDBX_SUCCESS))
*dbi = MAIN_DBI;
return rc;
}
if (unlikely(name == MDBX_CHK_GC || name->iov_base == MDBX_CHK_GC)) {
rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp);
if (likely(rc == MDBX_SUCCESS))
*dbi = FREE_DBI;
return rc;
}
if (unlikely(name == MDBX_CHK_META || name->iov_base == MDBX_CHK_META))
return MDBX_EINVAL;
if (unlikely(name->iov_len > txn->env->leaf_nodemax - NODESIZE - sizeof(tree_t)))
return MDBX_EINVAL;
#if MDBX_ENABLE_DBI_LOCKFREE
/* Is the DB already open? */
const MDBX_env *const env = txn->env;
size_t free_slot = env->n_dbi;
for (size_t i = CORE_DBS; i < env->n_dbi; ++i) {
retry:
if ((env->dbs_flags[i] & DB_VALID) == 0) {
free_slot = i;
continue;
}
const uint32_t snap_seq = atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease);
const uint16_t snap_flags = env->dbs_flags[i];
const MDBX_val snap_name = env->kvs[i].name;
if (user_flags != MDBX_ACCEDE &&
(((user_flags ^ snap_flags) & DB_PERSISTENT_FLAGS) || (keycmp && keycmp != env->kvs[i].clc.k.cmp) ||
(datacmp && datacmp != env->kvs[i].clc.v.cmp)))
continue;
const uint32_t main_seq = atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease);
MDBX_cmp_func *const snap_cmp = env->kvs[MAIN_DBI].clc.k.cmp;
if (unlikely(!(snap_flags & DB_VALID) || !snap_name.iov_base || !snap_name.iov_len || !snap_cmp))
continue;
const bool name_match = snap_cmp(&snap_name, name) == 0;
osal_flush_incoherent_cpu_writeback();
if (unlikely(snap_seq != atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease) ||
main_seq != atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease) ||
snap_flags != env->dbs_flags[i] || snap_name.iov_base != env->kvs[i].name.iov_base ||
snap_name.iov_len != env->kvs[i].name.iov_len))
goto retry;
if (name_match) {
rc = dbi_check(txn, i);
if (rc == MDBX_BAD_DBI && txn->dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) {
/* the handle was in use and became invalid,
* but is now being explicitly re-opened within this transaction */
eASSERT(env, !txn->cursors[i]);
txn->dbi_state[i] = DBI_LINDO;
rc = dbi_check(txn, i);
}
if (likely(rc == MDBX_SUCCESS)) {
rc = dbi_bind(txn, i, user_flags, keycmp, datacmp);
if (likely(rc == MDBX_SUCCESS))
*dbi = (MDBX_dbi)i;
}
return rc;
}
}
/* Fail if there is no free slot and the limit has been reached */
if (unlikely(free_slot >= env->max_dbi))
return MDBX_DBS_FULL;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
rc = osal_fastmutex_acquire(&txn->env->dbi_lock);
if (likely(rc == MDBX_SUCCESS)) {
rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name);
ENSURE(txn->env, osal_fastmutex_release(&txn->env->dbi_lock) == MDBX_SUCCESS);
}
return rc;
}
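/* Usage sketch: dbi_open() is the internal path behind the public
 * mdbx_dbi_open() API. Assuming an already-opened environment `env`
 * (hypothetical variable), a caller would typically do:
 *
 *   MDBX_txn *txn;
 *   int err = mdbx_txn_begin(env, NULL, 0, &txn);
 *   if (err == MDBX_SUCCESS) {
 *     MDBX_dbi dbi;
 *     err = mdbx_dbi_open(txn, "my-table", MDBX_CREATE, &dbi);
 *     if (err == MDBX_SUCCESS)
 *       err = mdbx_txn_commit(txn);
 *     else
 *       mdbx_txn_abort(txn);
 *   }
 */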
__cold struct dbi_rename_result dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) {
struct dbi_rename_result pair;
pair.defer = nullptr;
pair.err = dbi_check(txn, dbi);
if (unlikely(pair.err != MDBX_SUCCESS))
return pair;
MDBX_env *const env = txn->env;
MDBX_val old_name = env->kvs[dbi].name;
if (env->kvs[MAIN_DBI].clc.k.cmp(&new_name, &old_name) == 0 && MDBX_DEBUG == 0)
return pair;
cursor_couple_t cx;
pair.err = cursor_init(&cx.outer, txn, MAIN_DBI);
if (unlikely(pair.err != MDBX_SUCCESS))
return pair;
pair.err = cursor_seek(&cx.outer, &new_name, nullptr, MDBX_SET).err;
if (unlikely(pair.err != MDBX_NOTFOUND)) {
pair.err = (pair.err == MDBX_SUCCESS) ? MDBX_KEYEXIST : pair.err;
return pair;
}
pair.defer = osal_malloc(dbi_namelen(new_name));
if (unlikely(!pair.defer)) {
pair.err = MDBX_ENOMEM;
return pair;
}
new_name.iov_base = memcpy(pair.defer, new_name.iov_base, new_name.iov_len);
cx.outer.next = txn->cursors[MAIN_DBI];
txn->cursors[MAIN_DBI] = &cx.outer;
MDBX_val data = {&txn->dbs[dbi], sizeof(tree_t)};
pair.err = cursor_put_checklen(&cx.outer, &new_name, &data, N_TREE | MDBX_NOOVERWRITE);
if (likely(pair.err == MDBX_SUCCESS)) {
pair.err = cursor_seek(&cx.outer, &old_name, nullptr, MDBX_SET).err;
if (likely(pair.err == MDBX_SUCCESS))
pair.err = cursor_del(&cx.outer, N_TREE);
if (likely(pair.err == MDBX_SUCCESS)) {
pair.defer = env->kvs[dbi].name.iov_base;
env->kvs[dbi].name = new_name;
} else
txn->flags |= MDBX_TXN_ERROR;
}
txn->cursors[MAIN_DBI] = cx.outer.next;
return pair;
}
static defer_free_item_t *dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) {
eASSERT(env, dbi >= CORE_DBS);
if (unlikely(dbi >= env->n_dbi))
return nullptr;
const uint32_t seq = dbi_seq_next(env, dbi);
defer_free_item_t *defer_item = env->kvs[dbi].name.iov_base;
if (likely(defer_item)) {
env->dbs_flags[dbi] = 0;
env->kvs[dbi].name.iov_len = 0;
env->kvs[dbi].name.iov_base = nullptr;
atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease);
osal_flush_incoherent_cpu_writeback();
defer_item->next = nullptr;
if (env->n_dbi == dbi + 1) {
size_t i = env->n_dbi;
do {
--i;
eASSERT(env, i >= CORE_DBS);
eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len && !env->kvs[i].name.iov_base);
} while (i > CORE_DBS && !env->kvs[i - 1].name.iov_base);
env->n_dbi = (unsigned)i;
}
}
return defer_item;
}
__cold const tree_t *dbi_dig(const MDBX_txn *txn, const size_t dbi, tree_t *fallback) {
const MDBX_txn *dig = txn;
do {
tASSERT(txn, txn->n_dbi == dig->n_dbi);
const uint8_t state = dbi_state(dig, dbi);
if (state & DBI_LINDO)
switch (state & (DBI_VALID | DBI_STALE | DBI_OLDEN)) {
case DBI_VALID:
case DBI_OLDEN:
return dig->dbs + dbi;
case 0:
return fallback;
case DBI_VALID | DBI_STALE:
case DBI_OLDEN | DBI_STALE:
break;
default:
tASSERT(txn, !!"unexpected dig->dbi_state[dbi]");
}
dig = dig->parent;
} while (dig);
return fallback;
}
int dbi_close_release(MDBX_env *env, MDBX_dbi dbi) { return dbi_defer_release(env, dbi_close_locked(env, dbi)); }

147
src/dbi.h

@ -1,147 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
#if MDBX_ENABLE_DBI_SPARSE
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL size_t dbi_bitmap_ctz_fallback(const MDBX_txn *txn,
intptr_t bmi);
static inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) {
tASSERT(txn, bmi > 0);
STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->dbi_sparse[0]));
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
if (sizeof(txn->dbi_sparse[0]) <= sizeof(int))
return __builtin_ctz((int)bmi);
if (sizeof(txn->dbi_sparse[0]) == sizeof(long))
return __builtin_ctzl((long)bmi);
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || __has_builtin(__builtin_ctzll)
return __builtin_ctzll(bmi);
#endif /* have(long long) && long long == uint64_t */
#endif /* GNU C */
#if defined(_MSC_VER)
unsigned long index;
if (sizeof(txn->dbi_sparse[0]) > 4) {
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
_BitScanForward64(&index, bmi);
return index;
#else
if (bmi > UINT32_MAX) {
_BitScanForward(&index, (uint32_t)((uint64_t)bmi >> 32));
return index;
}
#endif
}
_BitScanForward(&index, (uint32_t)bmi);
return index;
#endif /* MSVC */
return dbi_bitmap_ctz_fallback(txn, bmi);
}
static inline bool dbi_foreach_step(const MDBX_txn *const txn, size_t *bitmap_item, size_t *dbi) {
const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
if (*bitmap_item & 1) {
*bitmap_item >>= 1;
return txn->dbi_state[*dbi] != 0;
}
if (*bitmap_item) {
size_t bitmap_skip = dbi_bitmap_ctz(txn, *bitmap_item);
*bitmap_item >>= bitmap_skip;
*dbi += bitmap_skip - 1;
} else {
*dbi = (*dbi - 1) | (bitmap_chunk - 1);
*bitmap_item = txn->dbi_sparse[(1 + *dbi) / bitmap_chunk];
if (*bitmap_item == 0)
*dbi += bitmap_chunk;
}
return false;
}
/* LY: the macro is deliberately built around a single loop so that
 * the break operator remains usable inside its body */
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \
for (size_t bitmap_item = TXN->dbi_sparse[0] >> FROM, I = FROM; I < TXN->n_dbi; ++I) \
if (dbi_foreach_step(TXN, &bitmap_item, &I))
#else
#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \
for (size_t I = FROM; I < TXN->n_dbi; ++I) \
if (TXN->dbi_state[I])
#endif /* MDBX_ENABLE_DBI_SPARSE */
#define TXN_FOREACH_DBI_ALL(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, 0)
#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS)
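/* Usage sketch (hypothetical caller): since the macro expands to a single
 * loop, break and continue behave as in a plain for-loop over the
 * transaction's DBI slots, e.g.:
 *
 *   TXN_FOREACH_DBI_USER(txn, i) {
 *     if (txn->dbi_state[i] & DBI_STALE)
 *       break;  // stop at the first stale user table
 *   }
 */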
MDBX_INTERNAL int dbi_import(MDBX_txn *txn, const size_t dbi);
struct dbi_snap_result {
uint32_t sequence;
unsigned flags;
};
MDBX_INTERNAL struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi);
MDBX_INTERNAL int dbi_update(MDBX_txn *txn, bool keep);
static inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) {
STATIC_ASSERT((int)DBI_DIRTY == MDBX_DBI_DIRTY && (int)DBI_STALE == MDBX_DBI_STALE &&
(int)DBI_FRESH == MDBX_DBI_FRESH && (int)DBI_CREAT == MDBX_DBI_CREAT);
#if MDBX_ENABLE_DBI_SPARSE
const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]);
const size_t bitmap_indx = dbi / bitmap_chunk;
const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk;
return likely(dbi < txn->n_dbi && (txn->dbi_sparse[bitmap_indx] & bitmap_mask) != 0) ? txn->dbi_state[dbi] : 0;
#else
return likely(dbi < txn->n_dbi) ? txn->dbi_state[dbi] : 0;
#endif /* MDBX_ENABLE_DBI_SPARSE */
}
static inline bool dbi_changed(const MDBX_txn *txn, const size_t dbi) {
const MDBX_env *const env = txn->env;
eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO);
const uint32_t snap_seq = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease);
return unlikely(snap_seq != txn->dbi_seqs[dbi]);
}
static inline int dbi_check(const MDBX_txn *txn, const size_t dbi) {
const uint8_t state = dbi_state(txn, dbi);
if (likely((state & DBI_LINDO) != 0 && !dbi_changed(txn, dbi)))
return (state & DBI_VALID) ? MDBX_SUCCESS : MDBX_BAD_DBI;
/* Slow path: lazy completion of initialization and import */
return dbi_import((MDBX_txn *)txn, dbi);
}
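/* Produces the next sequence number for a DBI slot: the counter is bumped
 * and zero is skipped on 32-bit wrap-around, so a valid sequence is never
 * equal to zero. */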
static inline uint32_t dbi_seq_next(const MDBX_env *const env, size_t dbi) {
uint32_t v = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease) + 1;
return v ? v : 1;
}
MDBX_INTERNAL int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags, MDBX_dbi *dbi,
MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp);
MDBX_INTERNAL int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, MDBX_cmp_func *keycmp,
MDBX_cmp_func *datacmp);
typedef struct defer_free_item {
struct defer_free_item *next;
uint64_t timestamp;
} defer_free_item_t;
MDBX_INTERNAL int dbi_defer_release(MDBX_env *const env, defer_free_item_t *const chain);
MDBX_INTERNAL int dbi_close_release(MDBX_env *env, MDBX_dbi dbi);
MDBX_INTERNAL const tree_t *dbi_dig(const MDBX_txn *txn, const size_t dbi, tree_t *fallback);
struct dbi_rename_result {
defer_free_item_t *defer;
int err;
};
MDBX_INTERNAL struct dbi_rename_result dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name);


@ -1,36 +1,34 @@
Removed:
#if defined(__GNUC__) && !defined(__LCC__)
#pragma push_macro("TRACE")
#pragma push_macro("DEBUG")
#pragma push_macro("VERBOSE")
#pragma push_macro("NOTICE")
#pragma push_macro("WARNING")
#pragma push_macro("ERROR")
#pragma push_macro("eASSERT")
#undef TRACE
#define TRACE(fmt, ...) debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef DEBUG
#define DEBUG(fmt, ...) debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef VERBOSE
#define VERBOSE(fmt, ...) debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef NOTICE
#define NOTICE(fmt, ...) debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef WARNING
#define WARNING(fmt, ...) debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef ERROR
#define ERROR(fmt, ...) debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef eASSERT
#define eASSERT(env, expr) ENSURE(env, expr)
#if !defined(__clang__)
#pragma GCC optimize("-Og")
#endif
#endif /* GCC only */

Added:
#pragma push_macro("mdbx_trace")
#pragma push_macro("mdbx_debug")
#pragma push_macro("mdbx_verbose")
#pragma push_macro("mdbx_notice")
#pragma push_macro("mdbx_warning")
#pragma push_macro("mdbx_error")
#pragma push_macro("mdbx_assert")
#undef mdbx_trace
#define mdbx_trace(fmt, ...) \
  mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef mdbx_debug
#define mdbx_debug(fmt, ...) \
  mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef mdbx_verbose
#define mdbx_verbose(fmt, ...) \
  mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef mdbx_notice
#define mdbx_notice(fmt, ...) \
  mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef mdbx_warning
#define mdbx_warning(fmt, ...) \
  mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef mdbx_error
#define mdbx_error(fmt, ...) \
  mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__)
#undef mdbx_assert
#define mdbx_assert(env, expr) mdbx_ensure(env, expr)


@ -1,15 +1,7 @@
Removed:
#if defined(__GNUC__) && !defined(__LCC__)
#pragma pop_macro("TRACE")
#pragma pop_macro("DEBUG")
#pragma pop_macro("VERBOSE")
#pragma pop_macro("NOTICE")
#pragma pop_macro("WARNING")
#pragma pop_macro("ERROR")
#pragma pop_macro("eASSERT")
#if !defined(__clang__)
#pragma GCC reset_options
#endif
#endif /* GCC only */

Added:
#pragma pop_macro("mdbx_trace")
#pragma pop_macro("mdbx_debug")
#pragma pop_macro("mdbx_verbose")
#pragma pop_macro("mdbx_notice")
#pragma pop_macro("mdbx_warning")
#pragma pop_macro("mdbx_error")
#pragma pop_macro("mdbx_assert")

425
src/defs.h Normal file

@ -0,0 +1,425 @@
/*
* Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
#pragma once
/* *INDENT-OFF* */
/* clang-format off */
#ifndef __GNUC_PREREQ
# if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define __GNUC_PREREQ(maj, min) \
((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
# else
# define __GNUC_PREREQ(maj, min) (0)
# endif
#endif /* __GNUC_PREREQ */
#ifndef __CLANG_PREREQ
# ifdef __clang__
# define __CLANG_PREREQ(maj,min) \
((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
# else
# define __CLANG_PREREQ(maj,min) (0)
# endif
#endif /* __CLANG_PREREQ */
#ifndef __GLIBC_PREREQ
# if defined(__GLIBC__) && defined(__GLIBC_MINOR__)
# define __GLIBC_PREREQ(maj, min) \
((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min))
# else
# define __GLIBC_PREREQ(maj, min) (0)
# endif
#endif /* __GLIBC_PREREQ */
#ifndef __has_attribute
# define __has_attribute(x) (0)
#endif
#ifndef __has_feature
# define __has_feature(x) (0)
#endif
#ifndef __has_extension
# define __has_extension(x) (0)
#endif
#ifndef __has_builtin
# define __has_builtin(x) (0)
#endif
#ifndef __has_warning
# define __has_warning(x) (0)
#endif
#ifndef __has_include
# define __has_include(x) (0)
#endif
#if __has_feature(thread_sanitizer)
# define __SANITIZE_THREAD__ 1
#endif
#if __has_feature(address_sanitizer)
# define __SANITIZE_ADDRESS__ 1
#endif
/*----------------------------------------------------------------------------*/
#ifndef __extern_C
# ifdef __cplusplus
# define __extern_C extern "C"
# else
# define __extern_C
# endif
#endif /* __extern_C */
#ifndef __cplusplus
# ifndef bool
# define bool _Bool
# endif
# ifndef true
# define true (1)
# endif
# ifndef false
# define false (0)
# endif
#endif
#if !defined(nullptr) && !defined(__cplusplus) || (__cplusplus < 201103L && !defined(_MSC_VER))
# define nullptr NULL
#endif
/*----------------------------------------------------------------------------*/
#ifndef __always_inline
# if defined(__GNUC__) || __has_attribute(__always_inline__)
# define __always_inline __inline __attribute__((__always_inline__))
# elif defined(_MSC_VER)
# define __always_inline __forceinline
# else
# define __always_inline
# endif
#endif /* __always_inline */
#ifndef __noinline
# if defined(__GNUC__) || __has_attribute(__noinline__)
# define __noinline __attribute__((__noinline__))
# elif defined(_MSC_VER)
# define __noinline __declspec(noinline)
# else
# define __noinline
# endif
#endif /* __noinline */
#ifndef __must_check_result
# if defined(__GNUC__) || __has_attribute(__warn_unused_result__)
# define __must_check_result __attribute__((__warn_unused_result__))
# else
# define __must_check_result
# endif
#endif /* __must_check_result */
#ifndef __maybe_unused
# if defined(__GNUC__) || __has_attribute(__unused__)
# define __maybe_unused __attribute__((__unused__))
# else
# define __maybe_unused
# endif
#endif /* __maybe_unused */
#if !defined(__noop) && !defined(_MSC_VER)
# define __noop(...) do {} while(0)
#endif /* __noop */
#ifndef __fallthrough
# if __GNUC_PREREQ(7, 0) || __has_attribute(__fallthrough__)
# define __fallthrough __attribute__((__fallthrough__))
# else
# define __fallthrough __noop()
# endif
#endif /* __fallthrough */
#ifndef __unreachable
# if __GNUC_PREREQ(4,5) || __has_builtin(__builtin_unreachable)
# define __unreachable() __builtin_unreachable()
# elif defined(_MSC_VER)
# define __unreachable() __assume(0)
# else
# define __unreachable() __noop()
# endif
#endif /* __unreachable */
#ifndef __prefetch
# if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch)
# define __prefetch(ptr) __builtin_prefetch(ptr)
# else
# define __prefetch(ptr) __noop(ptr)
# endif
#endif /* __prefetch */
#ifndef __noreturn
# if defined(__GNUC__) || __has_attribute(__noreturn__)
# define __noreturn __attribute__((__noreturn__))
# elif defined(_MSC_VER)
# define __noreturn __declspec(noreturn)
# else
# define __noreturn
# endif
#endif /* __noreturn */
#ifndef __nothrow
# if defined(__cplusplus)
# if __cplusplus < 201703L
# define __nothrow throw()
# else
# define __nothrow noexcept(true)
# endif /* __cplusplus */
# elif defined(__GNUC__) || __has_attribute(__nothrow__)
# define __nothrow __attribute__((__nothrow__))
# elif defined(_MSC_VER) && defined(__cplusplus)
# define __nothrow __declspec(nothrow)
# else
# define __nothrow
# endif
#endif /* __nothrow */
#ifndef __pure_function
/* Many functions have no effects except the return value and their
* return value depends only on the parameters and/or global variables.
* Such a function can be subject to common subexpression elimination
* and loop optimization just as an arithmetic operator would be.
* These functions should be declared with the attribute pure. */
# if defined(__GNUC__) || __has_attribute(__pure__)
# define __pure_function __attribute__((__pure__))
# else
# define __pure_function
# endif
#endif /* __pure_function */
#ifndef __const_function
/* Many functions do not examine any values except their arguments,
* and have no effects except the return value. Basically this is just
* slightly more strict class than the PURE attribute, since function
* is not allowed to read global memory.
*
* Note that a function that has pointer arguments and examines the
* data pointed to must not be declared const. Likewise, a function
* that calls a non-const function usually must not be const.
* It does not make sense for a const function to return void. */
# if defined(__GNUC__) || __has_attribute(__const__)
# define __const_function __attribute__((__const__))
# else
# define __const_function
# endif
#endif /* __const_function */
#ifndef __hidden
# if defined(__GNUC__) || __has_attribute(__visibility__)
# define __hidden __attribute__((__visibility__("hidden")))
# else
# define __hidden
# endif
#endif /* __hidden */
#ifndef __optimize
# if defined(__OPTIMIZE__)
# if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__)
# define __optimize(ops) __attribute__((__optimize__(ops)))
# else
# define __optimize(ops)
# endif
# else
# define __optimize(ops)
# endif
#endif /* __optimize */
#ifndef __hot
# if defined(__OPTIMIZE__)
# if defined(__e2k__)
# define __hot __attribute__((__hot__)) __optimize(3)
# elif defined(__clang__) && !__has_attribute(__hot_) \
&& __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
/* just put frequently used functions in separate section */
# define __hot __attribute__((__section__("text.hot"))) __optimize("O3")
# elif defined(__GNUC__) || __has_attribute(__hot__)
# define __hot __attribute__((__hot__)) __optimize("O3")
# else
# define __hot __optimize("O3")
# endif
# else
# define __hot
# endif
#endif /* __hot */
#ifndef __cold
# if defined(__OPTIMIZE__)
# if defined(__e2k__)
# define __cold __attribute__((__cold__)) __optimize(1)
# elif defined(__clang__) && !__has_attribute(cold) \
&& __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
/* just put infrequently used functions in separate section */
# define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
# elif defined(__GNUC__) || __has_attribute(cold)
# define __cold __attribute__((__cold__)) __optimize("Os")
# else
# define __cold __optimize("Os")
# endif
# else
# define __cold
# endif
#endif /* __cold */
#ifndef __flatten
# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__))
# define __flatten __attribute__((__flatten__))
# else
# define __flatten
# endif
#endif /* __flatten */
#ifndef likely
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
# define likely(x) (x)
# endif
#endif /* likely */
#ifndef unlikely
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
# define unlikely(x) (x)
# endif
#endif /* unlikely */
#ifndef __printf_args
# if defined(__GNUC__) || __has_attribute(__format__)
# define __printf_args(format_index, first_arg) \
__attribute__((__format__(__printf__, format_index, first_arg)))
# else
# define __printf_args(format_index, first_arg)
# endif
#endif /* __printf_args */
#ifndef __anonymous_struct_extension__
# if defined(__GNUC__)
# define __anonymous_struct_extension__ __extension__
# else
# define __anonymous_struct_extension__
# endif
#endif /* __anonymous_struct_extension__ */
#ifndef __Wpedantic_format_voidptr
static __inline __maybe_unused const void* __pure_function
__Wpedantic_format_voidptr(const void* ptr) {return ptr;}
# define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
#endif /* __Wpedantic_format_voidptr */
/*----------------------------------------------------------------------------*/
#if defined(MDBX_USE_VALGRIND)
# include <valgrind/memcheck.h>
# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE
/* LY: available since Valgrind 3.10 */
# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
# endif
#elif !defined(RUNNING_ON_VALGRIND)
# define VALGRIND_CREATE_MEMPOOL(h,r,z)
# define VALGRIND_DESTROY_MEMPOOL(h)
# define VALGRIND_MEMPOOL_TRIM(h,a,s)
# define VALGRIND_MEMPOOL_ALLOC(h,a,s)
# define VALGRIND_MEMPOOL_FREE(h,a)
# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s)
# define VALGRIND_MAKE_MEM_NOACCESS(a,s)
# define VALGRIND_MAKE_MEM_DEFINED(a,s)
# define VALGRIND_MAKE_MEM_UNDEFINED(a,s)
# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0)
# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0)
# define RUNNING_ON_VALGRIND (0)
#endif /* MDBX_USE_VALGRIND */
#ifdef __SANITIZE_ADDRESS__
# include <sanitizer/asan_interface.h>
#elif !defined(ASAN_POISON_MEMORY_REGION)
# define ASAN_POISON_MEMORY_REGION(addr, size) \
((void)(addr), (void)(size))
# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
((void)(addr), (void)(size))
#endif /* __SANITIZE_ADDRESS__ */
/*----------------------------------------------------------------------------*/
#ifndef ARRAY_LENGTH
# ifdef __cplusplus
template <typename T, size_t N>
char (&__ArraySizeHelper(T (&array)[N]))[N];
# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array)))
# else
# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0]))
# endif
#endif /* ARRAY_LENGTH */
#ifndef ARRAY_END
# define ARRAY_END(array) (&array[ARRAY_LENGTH(array)])
#endif /* ARRAY_END */
#ifndef STRINGIFY
# define STRINGIFY_HELPER(x) #x
# define STRINGIFY(x) STRINGIFY_HELPER(x)
#endif /* STRINGIFY */
#define CONCAT(a,b) a##b
#define XCONCAT(a,b) CONCAT(a,b)
#ifndef offsetof
# define offsetof(type, member) __builtin_offsetof(type, member)
#endif /* offsetof */
#ifndef container_of
# define container_of(ptr, type, member) \
((type *)((char *)(ptr) - offsetof(type, member)))
#endif /* container_of */
#define MDBX_TETRAD(a, b, c, d) \
((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d))
#define MDBX_STRING_TETRAD(str) MDBX_TETRAD(str[0], str[1], str[2], str[3])
#define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__)
#ifndef STATIC_ASSERT_MSG
# if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \
|| __has_feature(c_static_assert)
# define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg)
# elif defined(static_assert)
# define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg)
# elif defined(_MSC_VER)
# include <crtdbg.h>
# define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
# else
# define STATIC_ASSERT_MSG(expr, msg) switch (0) {case 0:case (expr):;}
# endif
#endif /* STATIC_ASSERT */
#ifndef STATIC_ASSERT
# define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
#endif
/* *INDENT-ON* */
/* clang-format on */

488
src/dpl.c

@ -1,488 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
static inline size_t dpl_size2bytes(ptrdiff_t size) {
assert(size > CURSOR_STACK_SIZE && (size_t)size <= PAGELIST_LIMIT);
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
size += size;
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) +
(PAGELIST_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1)) * sizeof(dp_t) +
MDBX_PNL_GRANULATE * sizeof(void *) * 2 <
SIZE_MAX / 4 * 3);
size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) + size * sizeof(dp_t),
MDBX_PNL_GRANULATE * sizeof(void *) * 2) -
MDBX_ASSUME_MALLOC_OVERHEAD;
return bytes;
}
static inline size_t dpl_bytes2size(const ptrdiff_t bytes) {
size_t size = (bytes - sizeof(dpl_t)) / sizeof(dp_t);
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
size >>= 1;
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
assert(size > CURSOR_STACK_SIZE && size <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
return size;
}
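/* Note on the pair above: dpl_size2bytes() rounds the request up to an
 * allocator-friendly granule (a multiple of MDBX_PNL_GRANULATE pointers,
 * net of the assumed malloc overhead), while dpl_bytes2size() maps the
 * actually obtained byte count back to an item capacity, so
 * dpl_bytes2size(dpl_size2bytes(n)) >= n for any valid n. */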
void dpl_free(MDBX_txn *txn) {
if (likely(txn->wr.dirtylist)) {
osal_free(txn->wr.dirtylist);
txn->wr.dirtylist = nullptr;
}
}
dpl_t *dpl_reserve(MDBX_txn *txn, size_t size) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
size_t bytes = dpl_size2bytes((size < PAGELIST_LIMIT) ? size : PAGELIST_LIMIT);
dpl_t *const dl = osal_realloc(txn->wr.dirtylist, bytes);
if (likely(dl)) {
#ifdef osal_malloc_usable_size
bytes = osal_malloc_usable_size(dl);
#endif /* osal_malloc_usable_size */
dl->detent = dpl_bytes2size(bytes);
tASSERT(txn, txn->wr.dirtylist == nullptr || dl->length <= dl->detent);
txn->wr.dirtylist = dl;
}
return dl;
}
int dpl_alloc(MDBX_txn *txn) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
const size_t wanna = (txn->env->options.dp_initial < txn->geo.upper) ? txn->env->options.dp_initial : txn->geo.upper;
#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG
if (txn->wr.dirtylist)
/* zero these out so the assert inside dpl_reserve() does not fire */
txn->wr.dirtylist->sorted = txn->wr.dirtylist->length = 0;
#endif /* asertions enabled */
if (unlikely(!txn->wr.dirtylist || txn->wr.dirtylist->detent < wanna || txn->wr.dirtylist->detent > wanna + wanna) &&
unlikely(!dpl_reserve(txn, wanna)))
return MDBX_ENOMEM;
/* LY: wr.dirtylist cannot be nullptr here, since it is either already allocated or will be allocated by dpl_reserve(). */
/* coverity[var_deref_model] */
dpl_clear(txn->wr.dirtylist);
return MDBX_SUCCESS;
}
#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno)
RADIXSORT_IMPL(dp, dp_t, MDBX_DPL_EXTRACT_KEY, MDBX_DPL_PREALLOC_FOR_RADIXSORT, 1)
#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno)
SORT_IMPL(dp_sort, false, dp_t, DP_SORT_CMP)
__hot __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
const size_t unsorted = dl->length - dl->sorted;
if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || unlikely(!dp_radixsort(dl->items + 1, dl->length))) {
if (dl->sorted > unsorted / 4 + 4 &&
(MDBX_DPL_PREALLOC_FOR_RADIXSORT || dl->length + unsorted < dl->detent + dpl_gap_mergesort)) {
dp_t *const sorted_begin = dl->items + 1;
dp_t *const sorted_end = sorted_begin + dl->sorted;
dp_t *const end =
dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT ? dl->length + dl->length + 1 : dl->detent + dpl_reserve_gap);
dp_t *const tmp = end - unsorted;
assert(dl->items + dl->length + 1 < tmp);
/* copy unsorted to the end of allocated space and sort it */
memcpy(tmp, sorted_end, unsorted * sizeof(dp_t));
dp_sort(tmp, tmp + unsorted);
/* merge two parts from end to begin */
dp_t *__restrict w = dl->items + dl->length;
dp_t *__restrict l = dl->items + dl->sorted;
dp_t *__restrict r = end - 1;
do {
const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5);
#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV
*w = cmp ? *l-- : *r--;
#else
*w = cmp ? *l : *r;
l -= cmp;
r += (ptrdiff_t)cmp - 1;
#endif
} while (likely(--w > l));
assert(r == tmp - 1);
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
if (ASSERT_ENABLED())
for (size_t i = 0; i <= dl->length; ++i)
assert(dl->items[i].pgno < dl->items[i + 1].pgno);
} else {
dp_sort(dl->items + 1, dl->items + dl->length + 1);
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
} else {
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
dl->sorted = dl->length;
return dl;
}
/* Returns the index of the first dirty-page whose pgno
* member is greater than or equal to id. */
#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id))
SEARCH_IMPL(dp_bsearch, dp_t, pgno_t, DP_SEARCH_CMP)
__hot __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
if (AUDIT_ENABLED()) {
for (const dp_t *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
assert(ptr[0].pgno < ptr[1].pgno);
assert(ptr[0].pgno >= NUM_METAS);
}
}
switch (dl->length - dl->sorted) {
default:
/* sort a whole */
dpl_sort_slowpath(txn);
break;
case 0:
/* whole sorted cases */
break;
#define LINEAR_SEARCH_CASE(N) \
case N: \
if (dl->items[dl->length - N + 1].pgno == pgno) \
return dl->length - N + 1; \
__fallthrough
/* use linear scan until the threshold */
LINEAR_SEARCH_CASE(7); /* fall through */
LINEAR_SEARCH_CASE(6); /* fall through */
LINEAR_SEARCH_CASE(5); /* fall through */
LINEAR_SEARCH_CASE(4); /* fall through */
LINEAR_SEARCH_CASE(3); /* fall through */
LINEAR_SEARCH_CASE(2); /* fall through */
case 1:
if (dl->items[dl->length].pgno == pgno)
return dl->length;
/* continue bsearch on the sorted part */
break;
}
return dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items;
}
const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
const dpl_t *dl = txn->wr.dirtylist;
if (dl) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
for (size_t i = dl->length; i > dl->sorted; --i)
if (dl->items[i].pgno == pgno)
return dl->items[i].ptr;
if (dl->sorted) {
const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items;
if (dl->items[i].pgno == pgno)
return dl->items[i].ptr;
}
} else {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
}
return nullptr;
}
void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
assert((intptr_t)i > 0 && i <= dl->length);
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
dl->pages_including_loose -= npages;
dl->sorted -= dl->sorted >= i;
dl->length -= 1;
memmove(dl->items + i, dl->items + i + 1, (dl->length - i + 2) * sizeof(dl->items[0]));
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
const dp_t dp = {page, pgno, (pgno_t)npages};
if ((txn->flags & MDBX_WRITEMAP) == 0) {
size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->wr.dirtylru;
}
dpl_t *dl = txn->wr.dirtylist;
tASSERT(txn, dl->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
if (AUDIT_ENABLED()) {
for (size_t i = dl->length; i > 0; --i) {
assert(dl->items[i].pgno != dp.pgno);
if (unlikely(dl->items[i].pgno == dp.pgno)) {
ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i);
return MDBX_PROBLEM;
}
}
}
if (unlikely(dl->length == dl->detent)) {
if (unlikely(dl->detent >= PAGELIST_LIMIT)) {
ERROR("DPL is full (PAGELIST_LIMIT %zu)", PAGELIST_LIMIT);
return MDBX_TXN_FULL;
}
const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) ? dl->detent + dl->detent : dl->detent + dl->detent / 2;
dl = dpl_reserve(txn, size);
if (unlikely(!dl))
return MDBX_ENOMEM;
tASSERT(txn, dl->length < dl->detent);
}
/* Sorting is needed for fast lookups; several tactics are combined:
* 1) Keep the ordering when items naturally arrive in the proper order.
* 2) Append to an unsorted tail, which is sorted and merged with the
* sorted head as needed; while the tail is short, search it by linear
* scan, avoiding a costly full re-sort.
* 3) If the unsorted tail is short and the item being added belongs close
* to the end of the sorted head, it is cheaper to insert the item
* straight into its proper place.
*
* Algorithmically:
* - append to the unsorted tail only if insertion would be really
* expensive, i.e. if the target position is far away from the end;
* - for a quick check it is enough to compare the item being added with
* the element located at the maximum acceptable distance from the end;
* - if the list is shorter, or the element at that position is smaller
* than the one being inserted, shift the elements and insert into the
* sorted head;
* - if the unsorted tail is longer, or the element at that position is
* larger, append to the unsorted tail. */
dl->pages_including_loose += npages;
dp_t *i = dl->items + dl->length;
const ptrdiff_t pivot = (ptrdiff_t)dl->length - dpl_insertion_threshold;
#if MDBX_HAVE_CMOV
const pgno_t pivot_pgno =
dl->items[(dl->length < dpl_insertion_threshold) ? 0 : dl->length - dpl_insertion_threshold].pgno;
#endif /* MDBX_HAVE_CMOV */
/* copy the stub beyond the end */
i[2] = i[1];
dl->length += 1;
if (likely(pivot <= (ptrdiff_t)dl->sorted) &&
#if MDBX_HAVE_CMOV
pivot_pgno < dp.pgno) {
#else
(pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) {
#endif /* MDBX_HAVE_CMOV */
dl->sorted += 1;
/* shift the unsorted tail */
while (i >= dl->items + dl->sorted) {
#if !defined(__GNUC__) /* try to avoid calling memmove() */
i[1] = *i;
#elif MDBX_WORDBITS == 64 && (defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128))
STATIC_ASSERT(sizeof(dp) == sizeof(__uint128_t));
((__uint128_t *)i)[1] = *(volatile __uint128_t *)i;
#else
i[1].ptr = i->ptr;
i[1].pgno = i->pgno;
i[1].npages = i->npages;
#endif
--i;
}
/* find the proper position while shifting the sorted elements */
while (i->pgno > pgno) {
tASSERT(txn, i > dl->items);
i[1] = *i;
--i;
}
tASSERT(txn, i->pgno < dp.pgno);
}
i[1] = dp;
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
assert(dl->sorted <= dl->length);
return MDBX_SUCCESS;
}
__cold bool dpl_check(MDBX_txn *txn) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
const dpl_t *const dl = txn->wr.dirtylist;
if (!dl) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
return true;
}
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
tASSERT(txn,
txn->wr.dirtyroom + dl->length == (txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
if (!AUDIT_ENABLED())
return true;
size_t loose = 0, pages = 0;
for (size_t i = dl->length; i > 0; --i) {
const page_t *const dp = dl->items[i].ptr;
if (!dp)
continue;
tASSERT(txn, dp->pgno == dl->items[i].pgno);
if (unlikely(dp->pgno != dl->items[i].pgno))
return false;
if ((txn->flags & MDBX_WRITEMAP) == 0) {
const uint32_t age = dpl_age(txn, i);
tASSERT(txn, age < UINT32_MAX / 3);
if (unlikely(age > UINT32_MAX / 3))
return false;
}
tASSERT(txn, dp->flags == P_LOOSE || is_modifable(txn, dp));
if (dp->flags == P_LOOSE) {
loose += 1;
} else if (unlikely(!is_modifable(txn, dp)))
return false;
const unsigned num = dpl_npages(dl, i);
pages += num;
tASSERT(txn, txn->geo.first_unallocated >= dp->pgno + num);
if (unlikely(txn->geo.first_unallocated < dp->pgno + num))
return false;
if (i < dl->sorted) {
tASSERT(txn, dl->items[i + 1].pgno >= dp->pgno + num);
if (unlikely(dl->items[i + 1].pgno < dp->pgno + num))
return false;
}
const size_t rpa = pnl_search(txn->wr.repnl, dp->pgno, txn->geo.first_unallocated);
tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->wr.repnl) || txn->wr.repnl[rpa] != dp->pgno);
if (rpa <= MDBX_PNL_GETSIZE(txn->wr.repnl) && unlikely(txn->wr.repnl[rpa] == dp->pgno))
return false;
if (num > 1) {
const size_t rpb = pnl_search(txn->wr.repnl, dp->pgno + num - 1, txn->geo.first_unallocated);
tASSERT(txn, rpa == rpb);
if (unlikely(rpa != rpb))
return false;
}
}
tASSERT(txn, loose == txn->wr.loose_count);
if (unlikely(loose != txn->wr.loose_count))
return false;
tASSERT(txn, pages == dl->pages_including_loose);
if (unlikely(pages != dl->pages_including_loose))
return false;
for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->wr.retired_pages); ++i) {
const page_t *const dp = debug_dpl_find(txn, txn->wr.retired_pages[i]);
tASSERT(txn, !dp);
if (unlikely(dp))
return false;
}
return true;
}
/*----------------------------------------------------------------------------*/
__noinline void dpl_lru_reduce(MDBX_txn *txn) {
VERBOSE("lru-reduce %u -> %u", txn->wr.dirtylru, txn->wr.dirtylru >> 1);
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
do {
txn->wr.dirtylru >>= 1;
dpl_t *dl = txn->wr.dirtylist;
for (size_t i = 1; i <= dl->length; ++i) {
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr >>= 1;
}
txn = txn->parent;
} while (txn);
}
void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
if (MDBX_PNL_GETSIZE(pl) && txn->wr.dirtylist->length) {
tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->geo.first_unallocated << spilled));
dpl_t *dl = dpl_sort(txn);
/* Scanning in ascending order */
const intptr_t step = MDBX_PNL_ASCENDING ? 1 : -1;
const intptr_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pl);
const intptr_t end = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(pl) + 1 : 0;
tASSERT(txn, pl[begin] <= pl[end - step]);
size_t w, r = dpl_search(txn, pl[begin] >> spilled);
tASSERT(txn, dl->sorted == dl->length);
for (intptr_t i = begin; r <= dl->length;) { /* scan loop */
assert(i != end);
tASSERT(txn, !spilled || (pl[i] & 1) == 0);
pgno_t pl_pgno = pl[i] >> spilled;
pgno_t dp_pgno = dl->items[r].pgno;
if (likely(dp_pgno != pl_pgno)) {
const bool cmp = dp_pgno < pl_pgno;
r += cmp;
i += cmp ? 0 : step;
if (likely(i != end))
continue;
return;
}
/* update loop */
unsigned npages;
w = r;
remove_dl:
npages = dpl_npages(dl, r);
dl->pages_including_loose -= npages;
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
page_shadow_release(txn->env, dl->items[r].ptr, npages);
++r;
next_i:
i += step;
if (unlikely(i == end)) {
while (r <= dl->length)
dl->items[w++] = dl->items[r++];
} else {
while (r <= dl->length) {
assert(i != end);
tASSERT(txn, !spilled || (pl[i] & 1) == 0);
pl_pgno = pl[i] >> spilled;
dp_pgno = dl->items[r].pgno;
if (dp_pgno < pl_pgno)
dl->items[w++] = dl->items[r++];
else if (dp_pgno > pl_pgno)
goto next_i;
else
goto remove_dl;
}
}
dl->sorted = dpl_setlen(dl, w - 1);
txn->wr.dirtyroom += r - w;
tASSERT(txn, txn->wr.dirtyroom + txn->wr.dirtylist->length ==
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
return;
}
}
}
void dpl_release_shadows(MDBX_txn *txn) {
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
MDBX_env *env = txn->env;
dpl_t *const dl = txn->wr.dirtylist;
for (size_t i = 1; i <= dl->length; i++)
page_shadow_release(env, dl->items[i].ptr, dpl_npages(dl, i));
dpl_clear(dl);
}

134
src/dpl.h

@ -1,134 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
static inline size_t dpl_setlen(dpl_t *dl, size_t len) {
static const page_t dpl_stub_pageE = {INVALID_TXNID,
0,
P_BAD,
{0},
/* pgno */ ~(pgno_t)0};
assert(dpl_stub_pageE.flags == P_BAD && dpl_stub_pageE.pgno == P_INVALID);
dl->length = len;
dl->items[len + 1].ptr = (page_t *)&dpl_stub_pageE;
dl->items[len + 1].pgno = P_INVALID;
dl->items[len + 1].npages = 1;
return len;
}
static inline void dpl_clear(dpl_t *dl) {
static const page_t dpl_stub_pageB = {INVALID_TXNID,
0,
P_BAD,
{0},
/* pgno */ 0};
assert(dpl_stub_pageB.flags == P_BAD && dpl_stub_pageB.pgno == 0);
dl->sorted = dpl_setlen(dl, 0);
dl->pages_including_loose = 0;
dl->items[0].ptr = (page_t *)&dpl_stub_pageB;
dl->items[0].pgno = 0;
dl->items[0].npages = 1;
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
MDBX_INTERNAL int __must_check_result dpl_alloc(MDBX_txn *txn);
MDBX_INTERNAL void dpl_free(MDBX_txn *txn);
MDBX_INTERNAL dpl_t *dpl_reserve(MDBX_txn *txn, size_t size);
MDBX_INTERNAL __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn);
static inline dpl_t *dpl_sort(const MDBX_txn *txn) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
tASSERT(txn, dl->length <= PAGELIST_LIMIT);
tASSERT(txn, dl->sorted <= dl->length);
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn);
}
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
MDBX_MAYBE_UNUSED MDBX_INTERNAL const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno);
MDBX_NOTHROW_PURE_FUNCTION static inline unsigned dpl_npages(const dpl_t *dl, size_t i) {
assert(0 <= (intptr_t)i && i <= dl->length);
unsigned n = dl->items[i].npages;
assert(n == (is_largepage(dl->items[i].ptr) ? dl->items[i].ptr->pages : 1));
return n;
}
MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t dpl_endpgno(const dpl_t *dl, size_t i) {
return dpl_npages(dl, i) + dl->items[i].pgno;
}
MDBX_NOTHROW_PURE_FUNCTION static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
tASSERT(txn, dl->sorted == dl->length);
tASSERT(txn, dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
size_t const n = dpl_search(txn, pgno);
tASSERT(txn, n >= 1 && n <= dl->length + 1);
tASSERT(txn, pgno <= dl->items[n].pgno);
tASSERT(txn, pgno > dl->items[n - 1].pgno);
const bool rc =
/* intersection with founded */ pgno + npages > dl->items[n].pgno ||
/* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno;
if (ASSERT_ENABLED()) {
bool check = false;
for (size_t i = 1; i <= dl->length; ++i) {
const page_t *const dp = dl->items[i].ptr;
if (!(dp->pgno /* begin */ >= /* end */ pgno + npages || dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno))
check |= true;
}
tASSERT(txn, check == rc);
}
return rc;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn, pgno_t pgno) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
dpl_t *dl = txn->wr.dirtylist;
size_t i = dpl_search(txn, pgno);
tASSERT(txn, (int)i > 0);
return (dl->items[i].pgno == pgno) ? i : 0;
}
MDBX_INTERNAL void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages);
static inline void dpl_remove(const MDBX_txn *txn, size_t i) {
dpl_remove_ex(txn, i, dpl_npages(txn->wr.dirtylist, i));
}
MDBX_INTERNAL int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, size_t npages);
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool dpl_check(MDBX_txn *txn);
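/* The LRU stamp read by dpl_age() below is the value stored by dpl_append()
 * in the size_t slot placed just before each shadow page (non-WRITEMAP
 * mode), so the age is the distance from the transaction's current
 * wr.dirtylru counter. */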
MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) {
tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
const dpl_t *dl = txn->wr.dirtylist;
assert((intptr_t)i > 0 && i <= dl->length);
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
return txn->wr.dirtylru - (uint32_t)*ptr;
}
MDBX_INTERNAL void dpl_lru_reduce(MDBX_txn *txn);
static inline uint32_t dpl_lru_turn(MDBX_txn *txn) {
txn->wr.dirtylru += 1;
if (unlikely(txn->wr.dirtylru > UINT32_MAX / 3) && (txn->flags & MDBX_WRITEMAP) == 0)
dpl_lru_reduce(txn);
return txn->wr.dirtylru;
}
MDBX_INTERNAL void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled);
MDBX_INTERNAL void dpl_release_shadows(MDBX_txn *txn);

1337
src/dxb.c

File diff suppressed because it is too large.

602
src/env.c

@ -1,602 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
MDBX_txn *env_owned_wrtxn(const MDBX_env *env) {
if (likely(env->basal_txn)) {
const bool is_owned = (env->flags & MDBX_NOSTICKYTHREADS) ? (env->basal_txn->owner != 0)
: (env->basal_txn->owner == osal_thread_self());
if (is_owned)
return env->txn ? env->txn : env->basal_txn;
}
return nullptr;
}
int env_page_auxbuffer(MDBX_env *env) {
const int err = env->page_auxbuf
? MDBX_SUCCESS
: osal_memalign_alloc(globals.sys_pagesize, env->ps * (size_t)NUM_METAS, &env->page_auxbuf);
if (likely(err == MDBX_SUCCESS)) {
memset(env->page_auxbuf, -1, env->ps * (size_t)2);
memset(ptr_disp(env->page_auxbuf, env->ps * (size_t)2), 0, env->ps);
}
return err;
}
__cold unsigned env_setup_pagesize(MDBX_env *env, const size_t pagesize) {
STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE);
STATIC_ASSERT(MDBX_MIN_PAGESIZE > sizeof(page_t) + sizeof(meta_t));
ENSURE(env, is_powerof2(pagesize));
ENSURE(env, pagesize >= MDBX_MIN_PAGESIZE);
ENSURE(env, pagesize <= MDBX_MAX_PAGESIZE);
ENSURE(env, !env->page_auxbuf && env->ps != pagesize);
env->ps = (unsigned)pagesize;
STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MIN_PAGESIZE) > 4);
STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MAX_PAGESIZE) < PAGELIST_LIMIT);
const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
ENSURE(env, maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)PAGELIST_LIMIT / 4);
env->maxgc_large1page = (unsigned)maxgc_ov1page;
env->maxgc_per_branch = (unsigned)((pagesize - PAGEHDRSZ) / (sizeof(indx_t) + sizeof(node_t) + sizeof(txnid_t)));
STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) > sizeof(tree_t) + NODESIZE + 42);
STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX);
STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) >= BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE));
STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) > NODESIZE + 42);
STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX);
const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize);
const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize);
ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) && branch_nodemax % 2 == 0 &&
leaf_nodemax > (intptr_t)(sizeof(tree_t) + NODESIZE + 42) && leaf_nodemax >= branch_nodemax &&
leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0);
env->leaf_nodemax = (uint16_t)leaf_nodemax;
env->branch_nodemax = (uint16_t)branch_nodemax;
env->ps2ln = (uint8_t)log2n_powerof2(pagesize);
eASSERT(env, pgno2bytes(env, 1) == pagesize);
eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2);
recalculate_merge_thresholds(env);
recalculate_subpage_thresholds(env);
env_options_adjust_dp_limit(env);
return env->ps;
}
__cold int env_sync(MDBX_env *env, bool force, bool nonblock) {
if (unlikely(env->flags & MDBX_RDONLY))
return MDBX_EACCESS;
MDBX_txn *const txn_owned = env_owned_wrtxn(env);
bool should_unlock = false;
int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */;
retry:;
unsigned flags = env->flags & ~(MDBX_NOMETASYNC | txn_shrink_allowed);
if (unlikely((flags & (ENV_FATAL_ERROR | ENV_ACTIVE)) != ENV_ACTIVE)) {
rc = (flags & ENV_FATAL_ERROR) ? MDBX_PANIC : MDBX_EPERM;
goto bailout;
}
const troika_t troika = (txn_owned || should_unlock) ? env->basal_txn->wr.troika : meta_tap(env);
const meta_ptr_t head = meta_recent(env, &troika);
const uint64_t unsynced_pages = atomic_load64(&env->lck->unsynced_pages, mo_Relaxed);
if (unsynced_pages == 0) {
const uint32_t synched_meta_txnid_u32 = atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady)
goto bailout;
}
if (should_unlock && (env->flags & MDBX_WRITEMAP) &&
unlikely(head.ptr_c->geometry.first_unallocated > bytes2pgno(env, env->dxb_mmap.current))) {
if (unlikely(env->stuck_meta >= 0) && troika.recent != (uint8_t)env->stuck_meta) {
NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent "
"meta-page (%u)",
"sync datafile", env->stuck_meta, troika.recent);
rc = MDBX_RESULT_TRUE;
} else {
rc = dxb_resize(env, head.ptr_c->geometry.first_unallocated, head.ptr_c->geometry.now, head.ptr_c->geometry.upper,
implicit_grow);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
}
const size_t autosync_threshold = atomic_load32(&env->lck->autosync_threshold, mo_Relaxed);
const uint64_t autosync_period = atomic_load64(&env->lck->autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
(autosync_period && (eoos_timestamp = atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) &&
osal_monotime() - eoos_timestamp >= autosync_period))
flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;
if (!txn_owned) {
if (!should_unlock) {
#if MDBX_ENABLE_PGOP_STAT
unsigned wops = 0;
#endif /* MDBX_ENABLE_PGOP_STAT */
int err;
/* pre-sync to avoid latency for writer */
if (unsynced_pages > /* FIXME: define threshold */ 42 && (flags & MDBX_SAFE_NOSYNC) == 0) {
eASSERT(env, ((flags ^ env->flags) & MDBX_WRITEMAP) == 0);
if (flags & MDBX_WRITEMAP) {
/* Acquire guard to avoid collision with remap */
#if defined(_WIN32) || defined(_WIN64)
imports.srwl_AcquireShared(&env->remap_guard);
#else
err = osal_fastmutex_acquire(&env->remap_guard);
if (unlikely(err != MDBX_SUCCESS))
return err;
#endif
const size_t usedbytes = pgno_align2os_bytes(env, head.ptr_c->geometry.first_unallocated);
err = osal_msync(&env->dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA);
#if defined(_WIN32) || defined(_WIN64)
imports.srwl_ReleaseShared(&env->remap_guard);
#else
int unlock_err = osal_fastmutex_release(&env->remap_guard);
if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS)
err = unlock_err;
#endif
} else
err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA);
if (unlikely(err != MDBX_SUCCESS))
return err;
#if MDBX_ENABLE_PGOP_STAT
wops = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
/* pre-sync done */
rc = MDBX_SUCCESS /* means "some data was synced" */;
}
err = lck_txn_lock(env, nonblock);
if (unlikely(err != MDBX_SUCCESS))
return err;
should_unlock = true;
#if MDBX_ENABLE_PGOP_STAT
env->lck->pgops.wops.weak += wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
env->basal_txn->wr.troika = meta_tap(env);
eASSERT(env, !env->txn && !env->basal_txn->nested);
goto retry;
}
eASSERT(env, head.txnid == recent_committed_txnid(env));
env->basal_txn->txnid = head.txnid;
txn_gc_detent(env->basal_txn);
flags |= txn_shrink_allowed;
}
eASSERT(env, txn_owned || should_unlock);
eASSERT(env, !txn_owned || (flags & txn_shrink_allowed) == 0);
if (!head.is_steady && unlikely(env->stuck_meta >= 0) && troika.recent != (uint8_t)env->stuck_meta) {
NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent "
"meta-page (%u)",
"sync datafile", env->stuck_meta, troika.recent);
rc = MDBX_RESULT_TRUE;
goto bailout;
}
if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64, data_page(head.ptr_c)->pgno,
durable_caption(head.ptr_c), unsynced_pages);
meta_t meta = *head.ptr_c;
rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->wr.troika);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
/* LY: sync meta-pages if MDBX_NOMETASYNC is enabled
 * and they were not synced above. */
if (atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) != (uint32_t)head.txnid)
rc = meta_sync(env, head);
bailout:
if (should_unlock)
lck_txn_unlock(env);
return rc;
}
__cold int env_open(MDBX_env *env, mdbx_mode_t mode) {
/* Using O_DSYNC or FILE_FLAG_WRITE_THROUGH:
*
* 0) If the DB page size is smaller than the system RAM page, the OS kernel
* has to refresh pages in the unified page cache more often.
*
* However, O_DSYNC does not imply bypassing the unified page cache,
* so such difficulties are treated as an OS problem and/or the expected
* penalty for using small DB pages.
*
* 1) In MDBX_SYNC_DURABLE mode - O_DSYNC for writing both data and
* meta-pages. However, on Linux, dropping O_DSYNC in favour of a subsequent
* fdatasync() may be more profitable with HDDs, since it lets the
* io-scheduler reorder writes according to the actual placement of the
* DB file on the medium.
*
* 2) In MDBX_NOMETASYNC mode - O_DSYNC can be used for data, but there may
* be little point in it, since fdatasync() is required anyway to guarantee
* that the meta of the previous transaction has been committed.
*
* As a result, on normal (non-Windows) systems there are two options:
* - if O_DIRECT and/or io_ring is available for data, it most likely
* makes sense to call fdatasync() before writing the data and then
* use O_DSYNC;
* - do not use O_DSYNC and call fdatasync() after writing the data.
*
* On Windows, however, the use of FlushFileBuffers() should be minimized
* because of its performance problems. Therefore, on Windows in
* MDBX_NOMETASYNC mode:
* - meta is updated via a descriptor without FILE_FLAG_WRITE_THROUGH;
* - before data writes begin, FlushFileBuffers() is called if
* meta_sync_txnid does not match the last written meta;
* - data is written via a descriptor with FILE_FLAG_WRITE_THROUGH.
*
* 3) In MDBX_SAFE_NOSYNC mode - there is no point in using O_DSYNC until
* fully asynchronous "catch-up" writing in a dedicated server process
* with internal io-ring queues is implemented.
*
* -----
*
* Using O_DIRECT or FILE_FLAG_NO_BUFFERING:
*
* The purpose of these flags is to detach the file descriptor from the
* unified page cache, i.e. from the memory-mapped data in the case of
* libmdbx.
*
* Therefore, using direct i/o in libmdbx without MDBX_WRITEMAP is pointless
* and counter-productive, since it provokes the OS kernel into
* incoherence between the memory mapping and the file contents on the
* medium, or requires extra checks and actions aimed at effectively
* disabling O_DIRECT for the memory-mapped data.
*
* In MDBX_WRITEMAP mode the coherence of the mapped data is guaranteed
* physically. So direct i/o may make sense if the OS kernel has some
* problems with msync(), including performance ones:
* - using io_ring or gather-write may be cheaper than the kernel
* scanning PTEs and writing out the modified/dirty pages;
* - but the problem is that pages written from user mode either will not
* be marked clean (and hence will be written by the kernel once more),
* or the kernel has to find and clean the PTEs when a write request
* arrives.
*
* Therefore O_DIRECT or FILE_FLAG_NO_BUFFERING is used:
* - only in MDBX_SYNC_DURABLE mode together with MDBX_WRITEMAP;
* - when ps >= me_os_psize;
* - with the MDBX_AVOID_MSYNC != 0 build option, which is enabled by
* default only on Windows (see below).
*
* -----
*
* Using FILE_FLAG_OVERLAPPED on Windows:
*
* Windows is very bad at I/O (except for direct page-wise scatter/gather,
* which bypasses the problematic unified page cache and is therefore
* almost useless for libmdbx).
*
* Things get even worse with FlushFileBuffers(), which is also required
* after FlushViewOfFile() in MDBX_WRITEMAP mode. So on Windows, instead of
* FlushViewOfFile() and FlushFileBuffers(), writes should go through a
* descriptor opened with FILE_FLAG_WRITE_THROUGH.
*
* In turn, writing with FILE_FLAG_WRITE_THROUGH is cheaper/faster when
* FILE_FLAG_OVERLAPPED is used. As a result, on Windows in durable modes
* data is always written in overlapped mode, while a separate
* non-overlapped descriptor is required for writing meta.
*/
env->pid = osal_getpid();
int rc = osal_openfile((env->flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ : MDBX_OPEN_DXB_LAZY, env, env->pathname.dxb,
&env->lazy_fd, mode);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
env->me_sysv_ipc.key = ftok(env->pathname.dxb, 42);
if (unlikely(env->me_sysv_ipc.key == -1))
return errno;
#endif /* MDBX_LOCKING */
/* Set the position in files outside of the data to avoid corruption
* due to erroneous use of file descriptors in the application code. */
const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000);
osal_fseek(env->lazy_fd, safe_parking_lot_offset);
env->fd4meta = env->lazy_fd;
#if defined(_WIN32) || defined(_WIN64)
eASSERT(env, env->ioring.overlapped_fd == 0);
bool ior_direct = false;
if (!(env->flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) {
if (MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) {
/* MDBX_SYNC_DURABLE | MDBX_WRITEMAP mode was requested with the
 * MDBX_AVOID_MSYNC option active.
 *
 * 1) In this combination it is most beneficial to use WriteFileGather(),
 * but for that the file must be opened with the FILE_FLAG_NO_BUFFERING flag
 * and afterwards the data addresses and sizes must be aligned to the
 * system page boundary, which in turn is possible only if the DB page size
 * is not less than the system RAM page size. Hence, to open the file in
 * the required mode, the DB page size must be known.
 *
 * 2) Besides that, on Windows writing into a locked region of a file
 * is possible only through the same descriptor. Therefore the initial
 * acquisition of locks by lck_seize(), acquiring/releasing locks
 * during write transactions and writing the data must all be done through
 * a single descriptor.
 *
 * Thus, the volatile DB header has to be read in order to learn the page
 * size, in order to open the file descriptor in the mode required for
 * writing the data, in order to use exactly this descriptor for the
 * initial acquisition of locks. */
meta_t header;
uint64_t dxb_filesize;
int err = dxb_read_header(env, &header, MDBX_SUCCESS, true);
if ((err == MDBX_SUCCESS && header.pagesize >= globals.sys_pagesize) ||
(err == MDBX_ENODATA && mode && env->ps >= globals.sys_pagesize &&
osal_filesize(env->lazy_fd, &dxb_filesize) == MDBX_SUCCESS && dxb_filesize == 0))
/* A collision is possible if two processes try to create a DB at the same
 * time with different page sizes, one of which is smaller than the system
 * page and the other is NOT smaller. This is a legal but very odd
 * situation, so we treat it as an error and do not try to resolve it. */
ior_direct = true;
}
rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT : MDBX_OPEN_DXB_OVERLAPPED, env, env->pathname.dxb,
&env->ioring.overlapped_fd, 0);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
env->dxb_lock_event = CreateEventW(nullptr, true, false, nullptr);
if (unlikely(!env->dxb_lock_event))
return (int)GetLastError();
osal_fseek(env->ioring.overlapped_fd, safe_parking_lot_offset);
}
#else
if (mode == 0) {
/* pickup mode for lck-file */
struct stat st;
if (unlikely(fstat(env->lazy_fd, &st)))
return errno;
mode = st.st_mode;
}
mode = (/* inherit read permissions for group and others */ mode & (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
/* always add read/write for owner */ S_IRUSR | S_IWUSR |
((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) |
((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0);
#endif /* !Windows */
const int lck_rc = lck_setup(env, mode);
if (unlikely(MDBX_IS_ERROR(lck_rc)))
return lck_rc;
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE)
osal_fseek(env->lck_mmap.fd, safe_parking_lot_offset);
eASSERT(env, env->dsync_fd == INVALID_HANDLE_VALUE);
if (!(env->flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | DEPRECATED_MAPASYNC
#if defined(_WIN32) || defined(_WIN64)
| MDBX_EXCLUSIVE
#endif /* !Windows */
))) {
rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env->pathname.dxb, &env->dsync_fd, 0);
if (unlikely(MDBX_IS_ERROR(rc)))
return rc;
if (env->dsync_fd != INVALID_HANDLE_VALUE) {
if ((env->flags & MDBX_NOMETASYNC) == 0)
env->fd4meta = env->dsync_fd;
osal_fseek(env->dsync_fd, safe_parking_lot_offset);
}
}
const MDBX_env_flags_t lazy_flags = MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC;
const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM | MDBX_NORDAHEAD | MDBX_RDONLY | MDBX_WRITEMAP;
lck_t *const lck = env->lck_mmap.lck;
if (lck && lck_rc != MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) {
MDBX_env_flags_t snap_flags;
while ((snap_flags = atomic_load32(&lck->envmode, mo_AcquireRelease)) == MDBX_RDONLY) {
if (atomic_cas32(&lck->envmode, MDBX_RDONLY, (snap_flags = (env->flags & mode_flags)))) {
/* The case:
* - let's assume that for some reason the DB file is smaller
* than it should be according to the geometry,
* but not smaller than the last page used;
* - the first process that opens the database (lck_rc == RESULT_TRUE)
* does this in readonly mode and therefore cannot bring
* the file size back to normal;
* - some next process (lck_rc != RESULT_TRUE) opens the DB in
* read-write mode and now is here.
*
* FIXME: Should we re-check and set the size of DB-file right here? */
break;
}
atomic_yield();
}
if (env->flags & MDBX_ACCEDE) {
/* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */
const MDBX_env_flags_t diff =
(snap_flags ^ env->flags) & ((snap_flags & lazy_flags) ? mode_flags : mode_flags & ~MDBX_WRITEMAP);
env->flags ^= diff;
NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->flags ^ diff, env->flags);
}
/* A previously missed non-obvious point: when the DB operates in
 * non-synchronous/lazy commit-to-disk modes, all writer processes must
 * have the same MDBX_WRITEMAP mode.
 *
 * Otherwise, flushing to disk would have to be done twice: first
 * msync(), then fdatasync(). Moreover, msync() is not obliged to work
 * in processes without MDBX_WRITEMAP, since there the file is memory-mapped
 * read-only. Therefore, in the general case, a mismatch in MDBX_WRITEMAP
 * makes it impossible to commit data to disk after it has been changed
 * in another process.
 *
 * In MDBX_UTTERLY_NOSYNC mode co-operation with MDBX_WRITEMAP should not
 * be allowed either, since no process (including the last one) can
 * reliably flush the data to disk, and consequently must not
 * mark any transaction as steady.
 *
 * As a result, it is necessary either to forbid processes with different
 * MDBX_WRITEMAP from working together in lazy-write modes, or to track
 * such mixing and block steady-marks - which is counter-productive. */
const MDBX_env_flags_t rigorous_flags = (snap_flags & lazy_flags)
? MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP
: MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC;
const MDBX_env_flags_t rigorous_diff = (snap_flags ^ env->flags) & rigorous_flags;
if (rigorous_diff) {
ERROR("current mode/flags 0x%X incompatible with requested 0x%X, "
"rigorous diff 0x%X",
env->flags, snap_flags, rigorous_diff);
return MDBX_INCOMPATIBLE;
}
}
mincore_clean_cache(env);
const int dxb_rc = dxb_setup(env, lck_rc, mode);
if (MDBX_IS_ERROR(dxb_rc))
return dxb_rc;
rc = osal_check_fs_incore(env->lazy_fd);
env->incore = false;
if (rc == MDBX_RESULT_TRUE) {
env->incore = true;
NOTICE("%s", "in-core database");
rc = MDBX_SUCCESS;
} else if (unlikely(rc != MDBX_SUCCESS)) {
ERROR("check_fs_incore(), err %d", rc);
return rc;
}
if (unlikely(/* recovery mode */ env->stuck_meta >= 0) &&
(lck_rc != /* exclusive */ MDBX_RESULT_TRUE || (env->flags & MDBX_EXCLUSIVE) == 0)) {
ERROR("%s", "recovery requires exclusive mode");
return MDBX_BUSY;
}
DEBUG("opened dbenv %p", (void *)env);
env->flags |= ENV_ACTIVE;
if (!lck || lck_rc == MDBX_RESULT_TRUE) {
env->lck->envmode.weak = env->flags & mode_flags;
env->lck->meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env);
env->lck->readers_check_timestamp.weak = osal_monotime();
}
if (lck) {
if (lck_rc == MDBX_RESULT_TRUE) {
rc = lck_downgrade(env);
DEBUG("lck-downgrade-%s: rc %i", (env->flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc);
if (rc != MDBX_SUCCESS)
return rc;
} else {
rc = mvcc_cleanup_dead(env, false, nullptr);
if (MDBX_IS_ERROR(rc))
return rc;
}
}
rc = (env->flags & MDBX_RDONLY) ? MDBX_SUCCESS
: osal_ioring_create(&env->ioring
#if defined(_WIN32) || defined(_WIN64)
,
ior_direct, env->ioring.overlapped_fd
#endif /* Windows */
);
return rc;
}
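/* Illustrative sketch, not a part of libmdbx itself: how an application would
 * pick between the durability trade-offs discussed in the comment at the top
 * of env_open() using only the public API. Assumes "mdbx.h" is included; the
 * helper name and the error handling are hypothetical and shortened. */
MDBX_MAYBE_UNUSED static int example_open_with_durability(const char *path, int fully_durable) {
  MDBX_env *env = nullptr;
  int rc = mdbx_env_create(&env);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* MDBX_SYNC_DURABLE (zero, the default) syncs data and meta on every commit;
   * MDBX_NOMETASYNC defers only the meta-page sync until the next commit or an
   * explicit mdbx_env_sync(). */
  const MDBX_env_flags_t flags = MDBX_NOSUBDIR | (fully_durable ? MDBX_SYNC_DURABLE : MDBX_NOMETASYNC);
  rc = mdbx_env_open(env, path, flags, 0640);
  if (rc != MDBX_SUCCESS)
    mdbx_env_close(env);
  return rc;
}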
__cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
const unsigned flags = env->flags;
env->flags &= ~ENV_INTERNAL_FLAGS;
if (flags & ENV_TXKEY) {
thread_key_delete(env->me_txkey);
env->me_txkey = 0;
}
if (env->lck)
munlock_all(env);
rthc_lock();
int rc = rthc_remove(env);
rthc_unlock();
#if MDBX_ENABLE_DBI_LOCKFREE
for (defer_free_item_t *next, *ptr = env->defer_free; ptr; ptr = next) {
next = ptr->next;
osal_free(ptr);
}
env->defer_free = nullptr;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
if ((env->flags & MDBX_RDONLY) == 0)
osal_ioring_destroy(&env->ioring);
env->lck = nullptr;
if (env->lck_mmap.lck)
osal_munmap(&env->lck_mmap);
if (env->dxb_mmap.base) {
osal_munmap(&env->dxb_mmap);
#ifdef ENABLE_MEMCHECK
VALGRIND_DISCARD(env->valgrind_handle);
env->valgrind_handle = -1;
#endif /* ENABLE_MEMCHECK */
}
#if defined(_WIN32) || defined(_WIN64)
eASSERT(env, !env->ioring.overlapped_fd || env->ioring.overlapped_fd == INVALID_HANDLE_VALUE);
if (env->dxb_lock_event != INVALID_HANDLE_VALUE) {
CloseHandle(env->dxb_lock_event);
env->dxb_lock_event = INVALID_HANDLE_VALUE;
}
eASSERT(env, !resurrect_after_fork);
if (env->pathname_char) {
osal_free(env->pathname_char);
env->pathname_char = nullptr;
}
#endif /* Windows */
if (env->dsync_fd != INVALID_HANDLE_VALUE) {
(void)osal_closefile(env->dsync_fd);
env->dsync_fd = INVALID_HANDLE_VALUE;
}
if (env->lazy_fd != INVALID_HANDLE_VALUE) {
(void)osal_closefile(env->lazy_fd);
env->lazy_fd = INVALID_HANDLE_VALUE;
}
if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) {
(void)osal_closefile(env->lck_mmap.fd);
env->lck_mmap.fd = INVALID_HANDLE_VALUE;
}
if (!resurrect_after_fork) {
if (env->kvs) {
for (size_t i = CORE_DBS; i < env->n_dbi; ++i)
if (env->kvs[i].name.iov_len)
osal_free(env->kvs[i].name.iov_base);
osal_free(env->kvs);
env->n_dbi = CORE_DBS;
env->kvs = nullptr;
}
if (env->page_auxbuf) {
osal_memalign_free(env->page_auxbuf);
env->page_auxbuf = nullptr;
}
if (env->dbi_seqs) {
osal_free(env->dbi_seqs);
env->dbi_seqs = nullptr;
}
if (env->dbs_flags) {
osal_free(env->dbs_flags);
env->dbs_flags = nullptr;
}
if (env->pathname.buffer) {
osal_free(env->pathname.buffer);
env->pathname.buffer = nullptr;
}
if (env->basal_txn) {
txn_basal_destroy(env->basal_txn);
env->basal_txn = nullptr;
}
}
env->stuck_meta = -1;
return rc;
}

View File

@ -1,125 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#define LIBMDBX_INTERNALS
#define MDBX_DEPRECATED
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
#include "preface.h"
#ifdef xMDBX_ALLOY
/* Amalgamated build */
#define MDBX_INTERNAL static
#else
/* Non-amalgamated build */
#define MDBX_INTERNAL
#endif /* xMDBX_ALLOY */
#include "../mdbx.h"
/*----------------------------------------------------------------------------*/
/* Basic constants and types */
typedef struct iov_ctx iov_ctx_t;
#include "osal.h"
#include "options.h"
#include "atomics-types.h"
#include "layout-dxb.h"
#include "layout-lck.h"
#define MIN_MAPSIZE (MDBX_MIN_PAGESIZE * MIN_PAGENO)
#if defined(_WIN32) || defined(_WIN64)
#define MAX_MAPSIZE32 UINT32_C(0x38000000)
#else
#define MAX_MAPSIZE32 UINT32_C(0x7f000000)
#endif
#define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MDBX_MAX_PAGESIZE)
#if MDBX_WORDBITS >= 64
#define MAX_MAPSIZE MAX_MAPSIZE64
#define PAGELIST_LIMIT ((size_t)MAX_PAGENO)
#else
#define MAX_MAPSIZE MAX_MAPSIZE32
#define PAGELIST_LIMIT (MAX_MAPSIZE32 / MDBX_MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482
#define MEGABYTE ((size_t)1 << 20)
/*----------------------------------------------------------------------------*/
union logger_union {
void *ptr;
MDBX_debug_func *fmt;
MDBX_debug_func_nofmt *nofmt;
};
struct libmdbx_globals {
bin128_t bootid;
unsigned sys_pagesize, sys_allocation_granularity;
uint8_t sys_pagesize_ln2;
uint8_t runtime_flags;
uint8_t loglevel;
#if defined(_WIN32) || defined(_WIN64)
bool running_under_Wine;
#elif defined(__linux__) || defined(__gnu_linux__)
bool running_on_WSL1 /* Windows Subsystem 1 for Linux */;
uint32_t linux_kernel_version;
#endif /* Linux */
union logger_union logger;
osal_fastmutex_t debug_lock;
size_t logger_buffer_size;
char *logger_buffer;
};
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
extern struct libmdbx_globals globals;
#if defined(_WIN32) || defined(_WIN64)
extern struct libmdbx_imports imports;
#endif /* Windows */
#include "logging_and_debug.h"
#include "utils.h"
#include "pnl.h"
#ifdef __cplusplus
}
#endif /* __cplusplus */
#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
#if defined(xMDBX_TOOLS)
extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif
#define MDBX_IS_ERROR(rc) ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)
/*----------------------------------------------------------------------------*/
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED static inline pgno_t int64pgno(int64_t i64) {
if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1))
return (pgno_t)i64;
return (i64 < (int64_t)MIN_PAGENO) ? MIN_PAGENO : MAX_PAGENO;
}
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED static inline pgno_t pgno_add(size_t base, size_t augend) {
assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO);
return int64pgno((int64_t)base + (int64_t)augend);
}
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED static inline pgno_t pgno_sub(size_t base, size_t subtrahend) {
assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO);
return int64pgno((int64_t)base - (int64_t)subtrahend);
}
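/* Illustrative examples of the clamping arithmetic above (assuming the
 * defaults MIN_PAGENO == NUM_METAS == 3 and MAX_PAGENO == 0x7FFFffff):
 *   pgno_add(MAX_PAGENO - 1, 10) -> MAX_PAGENO  (clamped at the upper bound)
 *   pgno_sub(MIN_PAGENO + 1, 10) -> MIN_PAGENO  (clamped at the lower bound)
 *   pgno_add(100, 10)            -> 110         (ordinary in-range case) */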

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,80 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
/* Histogram for deciding how to slice fragments when running short of GC identifiers/slots. */
typedef struct gc_dense_histogram {
/* The size of the array simultaneously sets the maximal length of the sequences
 * for which the distribution problem is solved.
 *
 * Using long sequences is counter-productive, since such sequences will
 * create/reproduce/repeat similar difficulties during subsequent recycling. However,
 * in rare situations this may be the only way out. */
unsigned end;
pgno_t array[31];
} gc_dense_histogram_t;
typedef struct gc_update_context {
unsigned loop;
unsigned goodchunk;
bool dense;
pgno_t prev_first_unallocated;
size_t retired_stored;
size_t return_reserved_lo, return_reserved_hi;
txnid_t gc_first;
intptr_t return_left;
#ifndef MDBX_DEBUG_GCU
#define MDBX_DEBUG_GCU 0
#endif
#if MDBX_DEBUG_GCU
struct {
txnid_t prev;
unsigned n;
} dbg;
#endif /* MDBX_DEBUG_GCU */
rkl_t ready4reuse, sequel;
#if MDBX_ENABLE_BIGFOOT
txnid_t bigfoot;
#endif /* MDBX_ENABLE_BIGFOOT */
union {
MDBX_cursor cursor;
cursor_couple_t couple;
};
gc_dense_histogram_t dense_histogram;
} gcu_t;
MDBX_INTERNAL int gc_put_init(MDBX_txn *txn, gcu_t *ctx);
MDBX_INTERNAL void gc_put_destroy(gcu_t *ctx);
#define ALLOC_DEFAULT 0 /* regular/ordinary page allocation */
#define ALLOC_UNIMPORTANT 1 /* the request is unimportant, an allocation failure will not fail the transaction */
#define ALLOC_RESERVE 2 /* preparing a reserve for the GC update, without actual allocation */
#define ALLOC_COALESCE 4 /* internal state/flag */
#define ALLOC_SHOULD_SCAN 8 /* internal state/flag */
#define ALLOC_LIFO 16 /* internal state/flag */
MDBX_INTERNAL pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags);
MDBX_INTERNAL pgr_t gc_alloc_single(const MDBX_cursor *const mc);
MDBX_INTERNAL int gc_update(MDBX_txn *txn, gcu_t *ctx);
MDBX_NOTHROW_PURE_FUNCTION static inline size_t gc_stockpile(const MDBX_txn *txn) {
return MDBX_PNL_GETSIZE(txn->wr.repnl) + txn->wr.loose_count;
}
MDBX_NOTHROW_PURE_FUNCTION static inline size_t gc_chunk_bytes(const size_t chunk) {
return (chunk + 1) * sizeof(pgno_t);
}
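/* Illustrative arithmetic (assuming a 4-byte pgno_t): a GC chunk holding 255
 * page numbers occupies gc_chunk_bytes(255) == 256 * 4 == 1024 bytes, i.e. the
 * stored list-length word plus the page numbers themselves. */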
MDBX_INTERNAL bool gc_repnl_has_span(const MDBX_txn *txn, const size_t num);
static inline bool gc_is_reclaimed(const MDBX_txn *txn, const txnid_t id) {
return rkl_contain(&txn->wr.gc.reclaimed, id) || rkl_contain(&txn->wr.gc.comeback, id);
}
static inline txnid_t txnid_min(txnid_t a, txnid_t b) { return (a < b) ? a : b; }
static inline txnid_t txnid_max(txnid_t a, txnid_t b) { return (a > b) ? a : b; }

View File

@ -1,474 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
static void mdbx_init(void);
static void mdbx_fini(void);
/*----------------------------------------------------------------------------*/
/* mdbx constructor/destructor */
#if defined(_WIN32) || defined(_WIN64)
#if MDBX_BUILD_SHARED_LIBRARY
#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
*
* Define the DLL's entry point only for Release builds, i.e. when NDEBUG is
* defined and MDBX_WITHOUT_MSVC_CRT=ON. If the entry point isn't defined, MSVC
* will automatically use DllMainCRTStartup() from the CRT library, which also
* automatically calls DllMain() from our mdbx.dll */
#pragma comment(linker, "/ENTRY:DllMain")
#endif /* MDBX_WITHOUT_MSVC_CRT */
BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
#else
#if !MDBX_MANUAL_MODULE_HANDLER
static
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
void NTAPI
mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
#endif /* MDBX_BUILD_SHARED_LIBRARY */
{
(void)reserved;
switch (reason) {
case DLL_PROCESS_ATTACH:
windows_import();
mdbx_init();
break;
case DLL_PROCESS_DETACH:
mdbx_fini();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
rthc_thread_dtor(module);
break;
}
#if MDBX_BUILD_SHARED_LIBRARY
return TRUE;
#endif
}
#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
/* *INDENT-OFF* */
/* clang-format off */
#if defined(_MSC_VER)
# pragma const_seg(push)
# pragma data_seg(push)
# ifndef _M_IX86
/* kick a linker to create the TLS directory if not already done */
# pragma comment(linker, "/INCLUDE:_tls_used")
/* Force some symbol references. */
# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
/* specific const-segment for WIN64 */
# pragma const_seg(".CRT$XLB")
const
# else
/* kick a linker to create the TLS directory if not already done */
# pragma comment(linker, "/INCLUDE:__tls_used")
/* Force some symbol references. */
# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
/* specific data-segment for WIN32 */
# pragma data_seg(".CRT$XLB")
# endif
__declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
# pragma data_seg(pop)
# pragma const_seg(pop)
#elif defined(__GNUC__)
# ifndef _M_IX86
const
# endif
PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
#else
# error FIXME
#endif
/* *INDENT-ON* */
/* clang-format on */
#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
#else
#if defined(__linux__) || defined(__gnu_linux__)
#include <sys/utsname.h>
MDBX_EXCLUDE_FOR_GPROF
__cold static uint8_t probe_for_WSL(const char *tag) {
const char *const WSL = strstr(tag, "WSL");
if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
return WSL[3] - '0';
const char *const wsl = strstr(tag, "wsl");
if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
return wsl[3] - '0';
if (WSL || wsl || strcasestr(tag, "Microsoft"))
/* Expecting no new kernels within WSL1; otherwise they will be explicitly
 * marked by an appropriate WSL-version hint. */
return (globals.linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
return 0;
}
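/* Illustrative examples for the probe above (hypothetical uname strings):
 *   "...-microsoft-standard-WSL2" -> 2 (digit taken right after "WSL");
 *   "4.4.0-19041-Microsoft"       -> 1 (marked "Microsoft" and the stock WSL1 kernel is older than 4.19);
 *   "5.15.0-91-generic"           -> 0 (not WSL at all). */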
#endif /* Linux */
#ifdef ENABLE_GPROF
extern void _mcleanup(void);
extern void monstartup(unsigned long, unsigned long);
extern void _init(void);
extern void _fini(void);
extern void __gmon_start__(void) __attribute__((__weak__));
#endif /* ENABLE_GPROF */
MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__constructor__)) void mdbx_global_constructor(void) {
#ifdef ENABLE_GPROF
if (!&__gmon_start__)
monstartup((uintptr_t)&_init, (uintptr_t)&_fini);
#endif /* ENABLE_GPROF */
#if defined(__linux__) || defined(__gnu_linux__)
struct utsname buffer;
if (uname(&buffer) == 0) {
int i = 0;
char *p = buffer.release;
while (*p && i < 4) {
if (*p >= '0' && *p <= '9') {
long number = strtol(p, &p, 10);
if (number > 0) {
if (number > 255)
number = 255;
globals.linux_kernel_version += number << (24 - i * 8);
}
++i;
} else {
++p;
}
}
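/* Illustrative example of the packing above: release "5.15.2-generic" yields
 * (5 << 24) | (15 << 16) | (2 << 8) == 0x050F0200; any component above 255 is
 * clamped to 255. */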
/* "Official" way of detecting WSL1 but not WSL2
* https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
*
* WARNING: False negative detection of WSL1 will result in DATA LOSS!
* So, the REQUIREMENTS for this code:
* 1. MUST detect WSL1 without false-negatives.
* 2. DESIRABLE to detect WSL2, but without the risk of violating the first. */
globals.running_on_WSL1 =
probe_for_WSL(buffer.version) == 1 || probe_for_WSL(buffer.sysname) == 1 || probe_for_WSL(buffer.release) == 1;
}
#endif /* Linux */
mdbx_init();
}
MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__destructor__)) void mdbx_global_destructor(void) {
mdbx_fini();
#ifdef ENABLE_GPROF
if (!&__gmon_start__)
_mcleanup();
#endif /* ENABLE_GPROF */
}
#endif /* ! Windows */
/******************************************************************************/
struct libmdbx_globals globals;
__cold static void mdbx_init(void) {
globals.runtime_flags = ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT;
globals.loglevel = MDBX_LOG_FATAL;
ENSURE(nullptr, osal_fastmutex_init(&globals.debug_lock) == 0);
osal_ctor();
assert(globals.sys_pagesize > 0 && (globals.sys_pagesize & (globals.sys_pagesize - 1)) == 0);
rthc_ctor();
#if MDBX_DEBUG
ENSURE(nullptr, troika_verify_fsm());
ENSURE(nullptr, pv2pages_verify());
#endif /* MDBX_DEBUG*/
}
MDBX_EXCLUDE_FOR_GPROF
__cold static void mdbx_fini(void) {
const uint32_t current_pid = osal_getpid();
TRACE(">> pid %d", current_pid);
rthc_dtor(current_pid);
osal_dtor();
TRACE("<< pid %d\n", current_pid);
ENSURE(nullptr, osal_fastmutex_destroy(&globals.debug_lock) == 0);
}
/******************************************************************************/
/* *INDENT-OFF* */
/* clang-format off */
__dll_export
#ifdef __attribute_used__
__attribute_used__
#elif defined(__GNUC__) || __has_attribute(__used__)
__attribute__((__used__))
#endif
#ifdef __attribute_externally_visible__
__attribute_externally_visible__
#elif (defined(__GNUC__) && !defined(__clang__)) || \
__has_attribute(__externally_visible__)
__attribute__((__externally_visible__))
#endif
const struct MDBX_build_info mdbx_build = {
#ifdef MDBX_BUILD_TIMESTAMP
MDBX_BUILD_TIMESTAMP
#else
"\"" __DATE__ " " __TIME__ "\""
#endif /* MDBX_BUILD_TIMESTAMP */
,
#ifdef MDBX_BUILD_TARGET
MDBX_BUILD_TARGET
#else
#if defined(__ANDROID_API__)
"Android" MDBX_STRINGIFY(__ANDROID_API__)
#elif defined(__linux__) || defined(__gnu_linux__)
"Linux"
#elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__)
"webassembly"
#elif defined(__CYGWIN__)
"CYGWIN"
#elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \
|| defined(__WINDOWS__)
"Windows"
#elif defined(__APPLE__)
#if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \
|| (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR)
"iOS"
#else
"MacOS"
#endif
#elif defined(__FreeBSD__)
"FreeBSD"
#elif defined(__DragonFly__)
"DragonFlyBSD"
#elif defined(__NetBSD__)
"NetBSD"
#elif defined(__OpenBSD__)
"OpenBSD"
#elif defined(__bsdi__)
"UnixBSDI"
#elif defined(__MACH__)
"MACH"
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC))
"HPUX"
#elif defined(_AIX)
"AIX"
#elif defined(__sun) && defined(__SVR4)
"Solaris"
#elif defined(__BSD__) || defined(BSD)
"UnixBSD"
#elif defined(__unix__) || defined(UNIX) || defined(__unix) \
|| defined(__UNIX) || defined(__UNIX__)
"UNIX"
#elif defined(_POSIX_VERSION)
"POSIX" MDBX_STRINGIFY(_POSIX_VERSION)
#else
"UnknownOS"
#endif /* Target OS */
"-"
#if defined(__amd64__)
"AMD64"
#elif defined(__ia32__)
"IA32"
#elif defined(__e2k__) || defined(__elbrus__)
"Elbrus"
#elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
"Alpha"
#elif defined(__aarch64__) || defined(_M_ARM64)
"ARM64"
#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \
|| defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \
|| defined(_M_ARMT) || defined(__arm)
"ARM"
#elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64))
"MIPS64"
#elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__)
"MIPS"
#elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64)
"PARISC64"
#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
"PARISC"
#elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \
|| defined(__IA64__) || defined(_M_IA64) || defined(__itanium__)
"Itanium"
#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \
|| defined(__powerpc64) || defined(_ARCH_PPC64)
"PowerPC64"
#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \
|| defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__)
"PowerPC"
#elif defined(__sparc64__) || defined(__sparc64)
"SPARC64"
#elif defined(__sparc__) || defined(__sparc)
"SPARC"
#elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch)
"S390"
#else
"UnknownARCH"
#endif
#endif /* MDBX_BUILD_TARGET */
#ifdef MDBX_BUILD_TYPE
# if defined(_MSC_VER)
# pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE)
# endif
"-" MDBX_BUILD_TYPE
#endif /* MDBX_BUILD_TYPE */
,
"MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG)
#ifdef ENABLE_GPROF
" ENABLE_GPROF"
#endif /* ENABLE_GPROF */
" MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS)
" BYTE_ORDER="
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
"LITTLE_ENDIAN"
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
"BIG_ENDIAN"
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
" MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT)
" MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG
" MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
" MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG
" MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG
" MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG
" MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC)
" MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
" MDBX_USE_MINCORE=" MDBX_STRINGIFY(MDBX_USE_MINCORE)
" MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT)
" MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC)
#if MDBX_DISABLE_VALIDATION
" MDBX_DISABLE_VALIDATION=YES"
#endif /* MDBX_DISABLE_VALIDATION */
#ifdef __SANITIZE_ADDRESS__
" SANITIZE_ADDRESS=YES"
#endif /* __SANITIZE_ADDRESS__ */
#ifdef ENABLE_MEMCHECK
" ENABLE_MEMCHECK=YES"
#endif /* ENABLE_MEMCHECK */
#if MDBX_FORCE_ASSERTIONS
" MDBX_FORCE_ASSERTIONS=YES"
#endif /* MDBX_FORCE_ASSERTIONS */
#ifdef _GNU_SOURCE
" _GNU_SOURCE=YES"
#else
" _GNU_SOURCE=NO"
#endif /* _GNU_SOURCE */
#ifdef __APPLE__
" MDBX_APPLE_SPEED_INSTEADOF_DURABILITY=" MDBX_STRINGIFY(MDBX_APPLE_SPEED_INSTEADOF_DURABILITY)
#endif /* MacOS */
#if defined(_WIN32) || defined(_WIN64)
" MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_WITHOUT_MSVC_CRT)
" MDBX_BUILD_SHARED_LIBRARY=" MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY)
#if !MDBX_BUILD_SHARED_LIBRARY
" MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER)
#endif
" WINVER=" MDBX_STRINGIFY(WINVER)
#else /* Windows */
" MDBX_LOCKING=" MDBX_LOCKING_CONFIG
" MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG
#endif /* !Windows */
" MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE)
" MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT)
" MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE)
" MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE)
" MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK)
" MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING)
,
#ifdef MDBX_BUILD_COMPILER
MDBX_BUILD_COMPILER
#else
#ifdef __INTEL_COMPILER
"Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER)
#elif defined(__apple_build_version__)
"Apple clang " MDBX_STRINGIFY(__apple_build_version__)
#elif defined(__ibmxl__)
"IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__)
"." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__)
#elif defined(__clang__)
"clang " MDBX_STRINGIFY(__clang_version__)
#elif defined(__MINGW64__)
"MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION)
#elif defined(__MINGW32__)
"MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION)
#elif defined(__MINGW__)
"MINGW " MDBX_STRINGIFY(__MINGW_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW_MINOR_VERSION)
#elif defined(__IBMC__)
"IBM C " MDBX_STRINGIFY(__IBMC__)
#elif defined(__GNUC__)
"GNU C/C++ "
#ifdef __VERSION__
__VERSION__
#else
MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." MDBX_STRINGIFY(__GNUC_PATCHLEVEL__)
#endif
#elif defined(_MSC_VER)
"MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD)
#else
"Unknown compiler"
#endif
#endif /* MDBX_BUILD_COMPILER */
,
#ifdef MDBX_BUILD_FLAGS_CONFIG
MDBX_BUILD_FLAGS_CONFIG
#endif /* MDBX_BUILD_FLAGS_CONFIG */
#if defined(MDBX_BUILD_FLAGS_CONFIG) && defined(MDBX_BUILD_FLAGS)
" "
#endif
#ifdef MDBX_BUILD_FLAGS
MDBX_BUILD_FLAGS
#endif /* MDBX_BUILD_FLAGS */
#if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS))
"undefined (please use correct build script)"
#ifdef _MSC_VER
#pragma message("warning: Build flags undefined. Please use correct build script")
#else
#warning "Build flags undefined. Please use correct build script"
#endif // _MSC_VER
#endif
, MDBX_BUILD_METADATA
};
#ifdef __SANITIZE_ADDRESS__
#if !defined(_MSC_VER) || __has_attribute(weak)
LIBMDBX_API __attribute__((__weak__))
#endif
const char *__asan_default_options(void) {
return "symbolize=1:allow_addr2line=1:"
#if MDBX_DEBUG
"debug=1:"
"verbosity=2:"
#endif /* MDBX_DEBUG */
"log_threads=1:"
"report_globals=1:"
"replace_str=1:replace_intrin=1:"
"malloc_context_size=9:"
#if !defined(__APPLE__)
"detect_leaks=1:"
#endif
"check_printf=1:"
"detect_deadlocks=1:"
#ifndef LTO_ENABLED
"check_initialization_order=1:"
#endif
"detect_stack_use_after_return=1:"
"intercept_tls_get_addr=1:"
"decorate_proc_maps=1:"
"abort_on_error=1";
}
#endif /* __SANITIZE_ADDRESS__ */
/* *INDENT-ON* */
/* clang-format on */

File diff suppressed because it is too large

View File

@ -1,288 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
#pragma pack(push, 4)
/* A stamp that identifies a file as an MDBX file.
* There's nothing special about this value other than that it is easily
* recognizable, and it will reflect any byte order mismatches. */
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_LEGACY_COMPAT ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)
#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)
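/* Illustrative composition of the signature above: with the default
 * MDBX_PNL_ASCENDING == 0, MDBX_DATA_MAGIC == (0x59659DBDEF4C11 << 8) + 3
 * == 0x59659DBDEF4C1103, i.e. the 56-bit magic in the upper bytes and the
 * format/ordering details in the low byte, presumably checked against the
 * meta-page header when a database is opened. */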
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
/* handle for the default DB. */
#define MAIN_DBI 1
/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2
/* Number of meta pages - also hardcoded elsewhere */
#define NUM_METAS 3
/* A page number in the database.
*
* MDBX uses 32 bit for page numbers. This limits database
* size up to 2^44 bytes, in case of 4K pages. */
typedef uint32_t pgno_t;
typedef mdbx_atomic_uint32_t atomic_pgno_t;
#define PRIaPGNO PRIu32
#define MAX_PAGENO UINT32_C(0x7FFFffff)
#define MIN_PAGENO NUM_METAS
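/* Illustrative arithmetic for the limit mentioned above: 2^32 page numbers
 * times 4096-byte pages gives 2^44 bytes (16 TiB); with MAX_PAGENO of
 * 0x7FFFffff and the maximal 64 KiB page size the hard cap is about
 * 2^31 * 2^16 == 2^47 bytes (128 TiB). */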
/* An invalid page number.
* Mainly used to denote an empty tree. */
#define P_INVALID (~(pgno_t)0)
/* A transaction ID. */
typedef uint64_t txnid_t;
typedef mdbx_atomic_uint64_t atomic_txnid_t;
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
#define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
#define INVALID_TXNID UINT64_MAX
/* Used for offsets within a single page. */
typedef uint16_t indx_t;
typedef struct tree {
uint16_t flags; /* see mdbx_dbi_open */
uint16_t height; /* height of this tree */
uint32_t dupfix_size; /* key-size for MDBX_DUPFIXED (DUPFIX pages) */
pgno_t root; /* the root page of this tree */
pgno_t branch_pages; /* number of branch pages */
pgno_t leaf_pages; /* number of leaf pages */
pgno_t large_pages; /* number of large pages */
uint64_t sequence; /* table sequence counter */
uint64_t items; /* number of data items */
uint64_t mod_txnid; /* txnid of last committed modification */
} tree_t;
/* database size-related parameters */
typedef struct geo {
uint16_t grow_pv; /* datafile growth step as a 16-bit packed (exponential
quantized) value */
uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
(exponential quantized) value */
pgno_t lower; /* minimal size of datafile in pages */
pgno_t upper; /* maximal size of datafile in pages */
union {
pgno_t now; /* current size of datafile in pages */
pgno_t end_pgno;
};
union {
pgno_t first_unallocated; /* first unused page in the datafile,
but actually the file may be shorter. */
pgno_t next_pgno;
};
} geo_t;
/* Meta page content.
* A meta page is the start point for accessing a database snapshot.
* Pages 0-2 are meta pages. */
typedef struct meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
uint32_t magic_and_version[2];
/* txnid that committed this meta, the first of a two-phase-update pair */
union {
mdbx_atomic_uint32_t txnid_a[2];
uint64_t unsafe_txnid;
};
uint16_t reserve16; /* extra flags, zero (nothing) for now */
uint8_t validator_id; /* ID of checksum and page validation method,
* zero (nothing) for now */
int8_t extra_pagehdr; /* extra bytes in the page header,
* zero (nothing) for now */
geo_t geometry; /* database size-related parameters */
union {
struct {
tree_t gc, main;
} trees;
__anonymous_struct_extension__ struct {
uint16_t gc_flags;
uint16_t gc_height;
uint32_t pagesize;
};
};
MDBX_canary canary;
#define DATASIGN_NONE 0u
#define DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > DATASIGN_WEAK)
union {
uint32_t sign[2];
uint64_t unsafe_sign;
};
/* txnid that committed this meta, the second of a two-phase-update pair */
mdbx_atomic_uint32_t txnid_b[2];
/* Number of non-meta pages which were put into the GC after COW. May be 0 if
 * the DB was previously handled by a libmdbx version without the corresponding
 * feature. This value, coupled with reader.snapshot_pages_retired, allows fast
 * estimation of "how much a reader is restraining GC recycling". */
uint32_t pages_retired[2];
/* The analogue of /proc/sys/kernel/random/boot_id or similar, used to determine
 * whether the system was rebooted after the last use of the database files.
 * If there was no reboot, there is no need to roll back to the last
 * steady sync point. Zeros mean that no relevant information is available
 * from the system. */
bin128_t bootid;
/* GUID of the database, starting from v0.13.1 */
bin128_t dxbid;
} meta_t;
#pragma pack(1)
typedef enum page_type {
P_BRANCH = 0x01u /* branch page */,
P_LEAF = 0x02u /* leaf page */,
P_LARGE = 0x04u /* large/overflow page */,
P_META = 0x08u /* meta page */,
P_LEGACY_DIRTY = 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */,
P_BAD = P_LEGACY_DIRTY /* explicit flag for invalid/bad page */,
P_DUPFIX = 0x20u /* for MDBX_DUPFIXED records */,
P_SUBP = 0x40u /* for MDBX_DUPSORT sub-pages */,
P_SPILLED = 0x2000u /* spilled in parent txn */,
P_LOOSE = 0x4000u /* page was dirtied then freed, can be reused */,
P_FROZEN = 0x8000u /* used for retire page with known status */,
P_ILL_BITS = (uint16_t)~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE | P_SPILLED),
page_broken = 0,
page_large = P_LARGE,
page_branch = P_BRANCH,
page_leaf = P_LEAF,
page_dupfix_leaf = P_DUPFIX,
page_sub_leaf = P_SUBP | P_LEAF,
page_sub_dupfix_leaf = P_SUBP | P_DUPFIX,
page_sub_broken = P_SUBP,
} page_type_t;
/* Common header for all page types. The page type depends on flags.
*
* P_BRANCH and P_LEAF pages have unsorted 'node_t's at the end, with
* sorted entries[] entries referring to them. Exception: P_DUPFIX pages
* omit entries and pack sorted MDBX_DUPFIXED values after the page header.
*
* P_LARGE records occupy one or more contiguous pages where only the
* first has a page header. They hold the real data of N_BIG nodes.
*
* P_SUBP sub-pages are small leaf "pages" with duplicate data.
* A node with flag N_DUP but not N_TREE contains a sub-page.
* (Duplicate data can also go in tables, which use normal pages.)
*
* P_META pages contain meta_t, the start point of an MDBX snapshot.
*
* Each non-metapage up to meta_t.mm_last_pg is reachable exactly once
* in the snapshot: Either used by a database or listed in a GC record. */
typedef struct page {
uint64_t txnid; /* txnid which created page, maybe zero in legacy DB */
uint16_t dupfix_ksize; /* key size if this is a DUPFIX page */
uint16_t flags;
union {
uint32_t pages; /* number of overflow pages */
__anonymous_struct_extension__ struct {
indx_t lower; /* lower bound of free space */
indx_t upper; /* upper bound of free space */
};
};
pgno_t pgno; /* page number */
#if FLEXIBLE_ARRAY_MEMBERS
indx_t entries[] /* dynamic size */;
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} page_t;
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ 20u
/* Header for a single key/data pair within a page.
* Used in pages of type P_BRANCH and P_LEAF without P_DUPFIX.
* We guarantee 2-byte alignment for 'node_t's.
*
* Leaf node flags describe node contents. N_BIG says the node's
* data part is the page number of an overflow page with actual data.
* N_DUP and N_TREE can be combined giving duplicate data in
* a sub-page/table, and named databases (just N_TREE). */
typedef struct node {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
union {
uint32_t dsize;
uint32_t child_pgno;
};
uint8_t flags; /* see node_flags */
uint8_t extra;
uint16_t ksize; /* key size */
#else
uint16_t ksize; /* key size */
uint8_t extra;
uint8_t flags; /* see node_flags */
union {
uint32_t child_pgno;
uint32_t dsize;
};
#endif /* __BYTE_ORDER__ */
#if FLEXIBLE_ARRAY_MEMBERS
uint8_t payload[] /* key and data are appended here */;
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} node_t;
/* Size of the node header, excluding dynamic data at the end */
#define NODESIZE 8u
typedef enum node_flags {
N_BIG = 0x01 /* data put on large page */,
N_TREE = 0x02 /* data is a b-tree */,
N_DUP = 0x04 /* data has duplicates */
} node_flags_t;
#pragma pack(pop)
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t page_type(const page_t *mp) { return mp->flags; }
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t page_type_compat(const page_t *mp) {
/* Drop the legacy P_DIRTY flag from sub-pages for compatibility,
 * for assertions only. */
return unlikely(mp->flags & P_SUBP) ? mp->flags & ~(P_SUBP | P_LEGACY_DIRTY) : mp->flags;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_leaf(const page_t *mp) {
return (mp->flags & P_LEAF) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_dupfix_leaf(const page_t *mp) {
return (mp->flags & P_DUPFIX) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_branch(const page_t *mp) {
return (mp->flags & P_BRANCH) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_largepage(const page_t *mp) {
return (mp->flags & P_LARGE) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool is_subpage(const page_t *mp) {
return (mp->flags & P_SUBP) != 0;
}
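/* Illustrative usage sketch (hypothetical helper, not part of libmdbx):
 * classifying a page for diagnostics via the predicates above. */
MDBX_MAYBE_UNUSED static inline const char *example_page_kind(const page_t *mp) {
  if (is_largepage(mp))
    return "large/overflow";
  if (is_branch(mp))
    return "branch";
  if (is_dupfix_leaf(mp))
    return "dupfix-leaf";
  if (is_leaf(mp))
    return is_subpage(mp) ? "sub-leaf" : "leaf";
  return "meta/other";
}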

View File

@ -1,289 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations of the license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 6
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_LCK_SIGN UINT32_C(0xF10C)
typedef void osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
#define MDBX_LCK_SIGN UINT32_C(0xF18D)
typedef mdbx_pid_t osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || MDBX_LOCKING == MDBX_LOCKING_POSIX2008
#define MDBX_LCK_SIGN UINT32_C(0x8017)
typedef pthread_mutex_t osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
#define MDBX_LCK_SIGN UINT32_C(0xFC29)
typedef sem_t osal_ipclock_t;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */
/* GC profiling statistics */
typedef struct gc_prof_stat {
/* Monotonic wall-clock time
 * spent on reading and searching within the GC */
uint64_t rtime_monotonic;
/* User-mode CPU time spent on preparing the pages
 * extracted from the GC, including paging them in from disk. */
uint64_t xtime_cpu;
/* Number of read-search iterations inside the GC while allocating pages */
uint32_t rsteps;
/* Number of requests to allocate sequences of pages,
 * i.e. when more than one page is requested at once */
uint32_t xpages;
/* Slow path execution counter */
uint32_t spe_counter;
/* page faults (hard page faults) */
uint32_t majflt;
/* For investigating pnl_merge() behaviour */
struct {
uint64_t time;
uint64_t volume;
uint32_t calls;
} pnl_merge;
} gc_prof_stat_t;
/* Statistics of pages operations for all transactions,
* including incomplete and aborted. */
typedef struct pgops {
mdbx_atomic_uint64_t newly; /* Quantity of a new pages added */
mdbx_atomic_uint64_t cow; /* Quantity of pages copied for update */
mdbx_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
for nested transactions */
mdbx_atomic_uint64_t split; /* Page splits */
mdbx_atomic_uint64_t merge; /* Page merges */
mdbx_atomic_uint64_t spill; /* Quantity of spilled dirty pages */
mdbx_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
mdbx_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */
mdbx_atomic_uint64_t msync; /* Number of explicit msync/flush-to-disk operations */
mdbx_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */
mdbx_atomic_uint64_t prefault; /* Number of prefault write operations */
mdbx_atomic_uint64_t mincore; /* Number of mincore() calls */
mdbx_atomic_uint32_t incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269
caught */
mdbx_atomic_uint32_t reserved;
/* Statistics for GC profiling.
 * Logically this data perhaps belongs in a separate structure,
 * but the difference would be purely cosmetic. */
struct {
/* Costs of maintaining the user's data */
gc_prof_stat_t work;
/* Costs of maintaining and updating the GC itself */
gc_prof_stat_t self;
/* GC-update iterations,
 * more than 1 if there were retries/restarts */
uint32_t wloops;
/* GC record coalescing iterations */
uint32_t coalescences;
/* Destructions of steady commit points in MDBX_UTTERLY_NOSYNC */
uint32_t wipes;
/* Flushes of data to disk outside of MDBX_UTTERLY_NOSYNC */
uint32_t flushes;
/* Attempts to kick lagging readers */
uint32_t kicks;
} gc_prof;
} pgop_stat_t;
/* Reader Lock Table
*
* Readers don't acquire any locks for their data access. Instead, they
* simply record their transaction ID in the reader table. The reader
* mutex is needed just to find an empty slot in the reader table. The
* slot's address is saved in thread-specific data so that subsequent
* read transactions started by the same thread need no further locking to
* proceed.
*
* If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in
* thread-specific data. No reader table is used if the database is on a
* read-only filesystem.
*
* Since the database uses multi-version concurrency control, readers don't
* actually need any locking. This table is used to keep track of which
* readers are using data from which old transactions, so that we'll know
* when a particular old transaction is no longer in use. Old transactions
* that have discarded any data pages can then have those pages reclaimed
* for use by a later write transaction.
*
* The lock table is constructed such that reader slots are aligned with the
* processor's cache line size. Any slot is only ever used by one thread.
* This alignment guarantees that there will be no contention or cache
* thrashing as threads update their own slot info, and also eliminates
* any need for locking when accessing a slot.
*
* A writer thread will scan every slot in the table to determine the oldest
* outstanding reader transaction. Any freed pages older than this will be
* reclaimed by the writer. The writer doesn't use any locks when scanning
* this table. This means that there's no guarantee that the writer will
* see the most up-to-date reader info, but that's not required for correct
* operation - all we need is to know the upper bound on the oldest reader,
* we don't care at all about the newest reader. So the only consequence of
* reading stale information here is that old pages might hang around a
* while longer before being reclaimed. That's actually good anyway, because
* the longer we delay reclaiming old pages, the more likely it is that a
* string of contiguous pages can be found after coalescing old pages from
* many old transactions together. */
/* The actual reader record, with cacheline padding. */
typedef struct reader_slot {
/* Current Transaction ID when this transaction began, or INVALID_TXNID.
* Multiple readers that start at the same time will probably have the
* same ID here. Again, it's not important to exclude them from
* anything; all we need to know is which version of the DB they
* started from so we can avoid overwriting any data used in that
* particular version. */
atomic_txnid_t txnid;
/* The information we store in a single slot of the reader table.
* In addition to a transaction ID, we also record the process and
* thread ID that owns a slot, so that we can detect stale information,
* e.g. threads or processes that went away without cleaning up.
*
* NOTE: We currently don't check for stale records.
* We simply re-init the table when we know that we're the only process
* opening the lock file. */
/* Pseudo thread_id for marking ousted reading transactions. */
#define MDBX_TID_TXN_OUSTED (UINT64_MAX - 1)
/* Pseudo thread_id for marking parked reading transactions. */
#define MDBX_TID_TXN_PARKED UINT64_MAX
/* The thread ID of the thread owning this txn. */
mdbx_atomic_uint64_t tid;
/* The process ID of the process owning this reader txn. */
mdbx_atomic_uint32_t pid;
/* The number of pages used in the reader's MVCC snapshot,
* i.e. the value of meta->geometry.first_unallocated and
* txn->geo.first_unallocated */
atomic_pgno_t snapshot_pages_used;
/* Number of retired pages at the time this reader starts its transaction. So,
 * at any time the difference meta.pages_retired -
 * reader.snapshot_pages_retired gives the number of pages which this
 * reader is restraining from reuse. */
mdbx_atomic_uint64_t snapshot_pages_retired;
} reader_slot_t;
/* The header for the reader table (a memory-mapped lock file). */
typedef struct shared_lck {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
uint64_t magic_and_version;
/* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
uint32_t os_and_format;
/* Flags which environment was opened. */
mdbx_atomic_uint32_t envmode;
/* Threshold of un-synced-with-disk pages for auto-sync feature,
* zero means no-threshold, i.e. auto-sync is disabled. */
atomic_pgno_t autosync_threshold;
/* Low 32 bits of the txnid with which the meta-pages were synced,
 * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3)
#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8)
#define MDBX_NOMETASYNC_LAZY_WRITEMAP (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8)
mdbx_atomic_uint32_t meta_sync_txnid;
/* Period for timed auto-sync feature, i.e. at the every steady checkpoint
* the mti_unsynced_timeout sets to the current_time + autosync_period.
* The time value is represented in a suitable system-dependent form, for
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
* Zero means timed auto-sync is disabled. */
mdbx_atomic_uint64_t autosync_period;
/* Marker to distinguish uniqueness of DB/CLK. */
mdbx_atomic_uint64_t bait_uniqueness;
/* Paired counter of processes that have mlock()ed part of the mmapped DB.
 * (mlcnt[0] - mlcnt[1]) > 0 means that at least one process has locked
 * at least one page, and therefore madvise() could return EINVAL. */
mdbx_atomic_uint32_t mlcnt[2];
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
/* Statistics of costly ops of all (running, completed and aborted)
* transactions */
pgop_stat_t pgops;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_LOCKING > 0
/* Write transaction lock. */
osal_ipclock_t wrt_lock;
#endif /* MDBX_LOCKING > 0 */
atomic_txnid_t cached_oldest;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
mdbx_atomic_uint64_t eoos_timestamp;
/* Number of un-synced-with-disk pages for the auto-sync feature. */
mdbx_atomic_uint64_t unsynced_pages;
/* Timestamp of the last readers check. */
mdbx_atomic_uint64_t readers_check_timestamp;
/* Number of page which was discarded last time by madvise(DONTNEED). */
atomic_pgno_t discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t readahead_anchor;
/* Shared cache for mincore() results */
struct {
pgno_t begin[4];
uint64_t mask[4];
} mincore_cache;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_LOCKING > 0
/* Readers table lock. */
osal_ipclock_t rdt_lock;
#endif /* MDBX_LOCKING > 0 */
/* The number of slots that have been used in the reader table.
* This always records the maximum count, it is not decremented
* when readers release their slots. */
mdbx_atomic_uint32_t rdt_length;
mdbx_atomic_uint32_t rdt_refresh_flag;
#if FLEXIBLE_ARRAY_MEMBERS
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
reader_slot_t rdt[] /* dynamic size */;
/* Lockfile format signature: version, features and field layout */
#define MDBX_LOCK_FORMAT \
(MDBX_LCK_SIGN * 27733 + (unsigned)sizeof(reader_slot_t) * 13 + \
(unsigned)offsetof(reader_slot_t, snapshot_pages_used) * 251 + (unsigned)offsetof(lck_t, cached_oldest) * 83 + \
(unsigned)offsetof(lck_t, rdt_length) * 37 + (unsigned)offsetof(lck_t, rdt) * 29)
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} lck_t;
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
#define MDBX_READERS_LIMIT 32767

File diff suppressed because it is too large

File diff suppressed because it is too large

174
src/lck.c
View File

@ -1,174 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
__cold static int lck_setup_locked(MDBX_env *env) {
int err = rthc_register(env);
if (unlikely(err != MDBX_SUCCESS))
return err;
int lck_seize_rc = lck_seize(env);
if (unlikely(MDBX_IS_ERROR(lck_seize_rc)))
return lck_seize_rc;
if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
env->lck = lckless_stub(env);
env->max_readers = UINT_MAX;
DEBUG("lck-setup:%s%s%s", " lck-less", (env->flags & MDBX_RDONLY) ? " readonly" : "",
(lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
return lck_seize_rc;
}
DEBUG("lck-setup:%s%s%s", " with-lck", (env->flags & MDBX_RDONLY) ? " readonly" : "",
(lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
MDBX_env *inprocess_neighbor = nullptr;
err = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor);
if (unlikely(MDBX_IS_ERROR(err)))
return err;
if (inprocess_neighbor) {
if ((globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || (inprocess_neighbor->flags & MDBX_EXCLUSIVE) != 0)
return MDBX_BUSY;
if (lck_seize_rc == MDBX_RESULT_TRUE) {
err = lck_downgrade(env);
if (unlikely(err != MDBX_SUCCESS))
return err;
lck_seize_rc = MDBX_RESULT_FALSE;
}
}
uint64_t size = 0;
err = osal_filesize(env->lck_mmap.fd, &size);
if (unlikely(err != MDBX_SUCCESS))
return err;
if (lck_seize_rc == MDBX_RESULT_TRUE) {
size = ceil_powerof2(env->max_readers * sizeof(reader_slot_t) + sizeof(lck_t), globals.sys_pagesize);
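/* Illustrative arithmetic (hypothetical numbers): with 4 KiB system pages,
 * a lck_t header of a few hundred bytes and, say, 120 reader slots of a few
 * dozen bytes each, the requested size is rounded up by ceil_powerof2() to
 * the next multiple of the system page size, e.g. 8 KiB. */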
jitter4testing(false);
} else {
if (env->flags & MDBX_EXCLUSIVE)
return MDBX_BUSY;
if (size > INT_MAX || (size & (globals.sys_pagesize - 1)) != 0 || size < globals.sys_pagesize) {
ERROR("lck-file has invalid size %" PRIu64 " bytes", size);
return MDBX_PROBLEM;
}
}
const size_t maxreaders = ((size_t)size - sizeof(lck_t)) / sizeof(reader_slot_t);
if (maxreaders < 4) {
ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders);
return MDBX_PROBLEM;
}
env->max_readers = (maxreaders <= MDBX_READERS_LIMIT) ? (unsigned)maxreaders : (unsigned)MDBX_READERS_LIMIT;
err =
osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap, (size_t)size, (size_t)size,
lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE, env->pathname.lck);
if (unlikely(err != MDBX_SUCCESS))
return err;
#ifdef MADV_DODUMP
err = madvise(env->lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys_and_eagain(errno) : MDBX_SUCCESS;
if (unlikely(MDBX_IS_ERROR(err)))
return err;
#endif /* MADV_DODUMP */
#ifdef MADV_WILLNEED
err = madvise(env->lck_mmap.lck, size, MADV_WILLNEED) ? ignore_enosys_and_eagain(errno) : MDBX_SUCCESS;
if (unlikely(MDBX_IS_ERROR(err)))
return err;
#elif defined(POSIX_MADV_WILLNEED)
err = ignore_enosys(posix_madvise(env->lck_mmap.lck, size, POSIX_MADV_WILLNEED));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
#endif /* MADV_WILLNEED */
lck_t *lck = env->lck_mmap.lck;
if (lck_seize_rc == MDBX_RESULT_TRUE) {
/* If we have succeeded in getting an exclusive lock, then nobody else is using
 * the lock region and we should initialize it. */
memset(lck, 0, (size_t)size);
jitter4testing(false);
lck->magic_and_version = MDBX_LOCK_MAGIC;
lck->os_and_format = MDBX_LOCK_FORMAT;
#if MDBX_ENABLE_PGOP_STAT
lck->pgops.wops.weak = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
err = osal_msync(&env->lck_mmap, 0, (size_t)size, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
if (unlikely(err != MDBX_SUCCESS)) {
ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err);
eASSERT(env, MDBX_IS_ERROR(err));
return err;
}
} else {
if (lck->magic_and_version != MDBX_LOCK_MAGIC) {
const bool invalid = (lck->magic_and_version >> 8) != MDBX_MAGIC;
ERROR("lock region has %s", invalid ? "invalid magic"
: "incompatible version (only applications with nearly the same "
"or identical versions of libmdbx can share the same database)");
return invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH;
}
if (lck->os_and_format != MDBX_LOCK_FORMAT) {
ERROR("lock region has os/format signature 0x%" PRIx32 ", expected 0x%" PRIx32, lck->os_and_format,
MDBX_LOCK_FORMAT);
return MDBX_VERSION_MISMATCH;
}
}
err = lck_init(env, inprocess_neighbor, lck_seize_rc);
if (unlikely(err != MDBX_SUCCESS)) {
eASSERT(env, MDBX_IS_ERROR(err));
return err;
}
env->lck = lck;
eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc));
return lck_seize_rc;
}
__cold int lck_setup(MDBX_env *env, mdbx_mode_t mode) {
eASSERT(env, env->lazy_fd != INVALID_HANDLE_VALUE);
eASSERT(env, env->lck_mmap.fd == INVALID_HANDLE_VALUE);
int err = osal_openfile(MDBX_OPEN_LCK, env, env->pathname.lck, &env->lck_mmap.fd, mode);
if (err != MDBX_SUCCESS) {
switch (err) {
case MDBX_EACCESS:
case MDBX_EPERM:
if (F_ISSET(env->flags, MDBX_RDONLY | MDBX_EXCLUSIVE))
break;
__fallthrough /* fall through */;
case MDBX_ENOFILE:
case MDBX_EROFS:
if (env->flags & MDBX_RDONLY) {
/* ENSURE the file system is read-only */
int err_rofs = osal_check_fs_rdonly(env->lazy_fd, env->pathname.lck, err);
if (err_rofs == MDBX_SUCCESS ||
/* ignore ERROR_NOT_SUPPORTED for exclusive mode */
(err_rofs == MDBX_ENOSYS && (env->flags & MDBX_EXCLUSIVE)))
break;
if (err_rofs != MDBX_ENOSYS)
err = err_rofs;
}
__fallthrough /* fall through */;
default:
ERROR("unable to open lck-file %" MDBX_PRIsPATH ", env-flags 0x%X, err %d", env->pathname.lck, env->flags, err);
return err;
}
/* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
env->lck_mmap.fd = INVALID_HANDLE_VALUE;
NOTICE("continue %" MDBX_PRIsPATH " within without-lck mode, env-flags 0x%X, lck-error %d", env->pathname.dxb,
env->flags, err);
}
rthc_lock();
err = lck_setup_locked(env);
rthc_unlock();
return err;
}
void mincore_clean_cache(const MDBX_env *const env) {
memset(env->lck->mincore_cache.begin, -1, sizeof(env->lck->mincore_cache.begin));
}
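The exclusive/cooperative distinction handled above is also visible from the public API: opening with MDBX_EXCLUSIVE either seizes the lck-file exclusively or fails with MDBX_BUSY, while a plain open joins the cooperative shared mode. A minimal sketch (the path handling and 0644 mode are illustrative, not taken from the sources):

#include "mdbx.h"

/* Try an exclusive open first; on MDBX_BUSY fall back to the cooperative
 * (shared) mode with a fresh handle. Illustrative sketch only. */
static int open_exclusive_or_shared(const char *path, MDBX_env **out) {
  MDBX_env_flags_t flags = MDBX_EXCLUSIVE;
  for (;;) {
    MDBX_env *env = NULL;
    int rc = mdbx_env_create(&env);
    if (rc != MDBX_SUCCESS)
      return rc;
    rc = mdbx_env_open(env, path, flags, 0644);
    if (rc == MDBX_SUCCESS) {
      *out = env;
      return rc;
    }
    mdbx_env_close(env);
    if (rc != MDBX_BUSY || flags != MDBX_EXCLUSIVE)
      return rc;               /* give up on any error other than the first BUSY */
    flags = MDBX_ENV_DEFAULTS; /* retry in cooperative mode */
  }
}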

src/lck.h

@ -1,110 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
MDBX_INTERNAL int lck_setup(MDBX_env *env, mdbx_mode_t mode);
#if MDBX_LOCKING > MDBX_LOCKING_SYSV
MDBX_INTERNAL int lck_ipclock_stubinit(osal_ipclock_t *ipc);
MDBX_INTERNAL int lck_ipclock_destroy(osal_ipclock_t *ipc);
#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */
/// \brief Initialization of synchronization primitives linked with MDBX_env
/// instance both in LCK-file and within the current process.
/// \param
/// global_uniqueness_flag = true - denotes that there are no other processes
/// working with DB and LCK-file. Thus the function MUST initialize
/// shared synchronization objects in memory-mapped LCK-file.
/// global_uniqueness_flag = false - denotes that at least one process is
/// already working with DB and LCK-file, including the case when DB
/// has already been opened in the current process. Thus the function
/// MUST NOT initialize shared synchronization objects in memory-mapped
/// LCK-file that are already in use.
/// \return Error code or zero on success.
MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag);
/// \brief Disconnects from shared interprocess objects and destructs
/// synchronization objects linked with MDBX_env instance
/// within the current process.
/// \param
/// inprocess_neighbor = nullptr - if the current process does not have other
/// instances of MDBX_env linked with the DB being closed.
/// Thus the function MUST check for other processes working with DB or
/// LCK-file, and keep or destroy shared synchronization objects in
/// memory-mapped LCK-file depending on the result.
/// inprocess_neighbor = not-nullptr - pointer to another instance of MDBX_env
///   (any one of them, if there are several) working with DB or LCK-file within the
/// current process. Thus the function MUST NOT try to acquire exclusive
/// lock and/or try to destruct shared synchronization objects linked with
/// DB or LCK-file. Moreover, the implementation MUST ensure correct work
/// of other instances of MDBX_env within the current process, e.g.
/// restore POSIX-fcntl locks after the closing of file descriptors.
/// \return Error code (MDBX_PANIC) or zero on success.
MDBX_INTERNAL int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor, const uint32_t current_pid);
/// \brief Connects to shared interprocess locking objects and tries to acquire
/// the maximum lock level (shared if exclusive is not available)
/// Depending on the implementation and/or platform (Windows), this function may
/// acquire the non-OS super-level lock (e.g. for shared synchronization
/// objects initialization), which will be downgraded to OS-exclusive or
/// shared via explicit calling of lck_downgrade().
/// \return
/// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
/// the current process is the first and only after the last use of DB.
/// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
/// DB has already been opened and now is used by other processes.
/// Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL int lck_seize(MDBX_env *env);
/// \brief Downgrades the level of initially acquired lock to
/// operational level specified by argument. The reason for such downgrade:
/// - unblocking of other processes that are waiting for access, i.e.
/// if (env->flags & MDBX_EXCLUSIVE) != 0, then other processes
/// should be made aware that access is unavailable rather than
/// wait for it.
/// - freeing locks that interfere with file operations (especially on Windows)
/// (env->flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
/// (env->flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
/// operational lock.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_downgrade(MDBX_env *env);
MDBX_MAYBE_UNUSED MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait);
/// \brief Locks the LCK-file and/or the table of readers for (de)registering.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env);
/// \brief Unlocks the LCK-file and/or the table of readers after (de)registering.
MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env);
/// \brief Acquires write-transaction lock.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_txn_lock(MDBX_env *env, bool dont_wait);
/// \brief Releases the write-transaction lock.
MDBX_INTERNAL void lck_txn_unlock(MDBX_env *env);
/// \brief Sets alive-flag of reader presence (indicative lock) for PID of
/// the current process. The function does no more than needed for
/// the correct working of lck_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_rpid_set(MDBX_env *env);
/// \brief Resets alive-flag of reader presence (indicative lock)
/// for PID of the current process. The function does no more than needed
/// for the correct working of lck_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env);
/// \brief Checks for reading process status with the given pid with help of
/// alive-flag of presence (indicative lock) or using another way.
/// \return
/// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
/// and working with DB (indicative lock is present).
/// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
/// or not working with DB (indicative lock is not present).
/// Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid);
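Taken together, these declarations imply a fixed order of calls during environment setup. The following condensed sketch (internal context assumed, error handling trimmed) restates the sequence that lck_setup_locked() above performs in full; it is illustrative only, not a verbatim copy of the real call sites:

/* Illustrative ordering sketch; see lck_setup_locked() above for the
 * real implementation with full error handling. */
static int lck_open_order_sketch(MDBX_env *env) {
  int seize = lck_seize(env);          /* RESULT_TRUE => exclusive          */
  if (MDBX_IS_ERROR(seize))
    return seize;
  MDBX_env *inprocess_neighbor = NULL; /* another MDBX_env in this process? */
  int err = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor);
  if (MDBX_IS_ERROR(err))
    return err;
  /* map the lck-file, then initialize (exclusive) or validate (shared)
   * the shared structures ... */
  err = lck_init(env, inprocess_neighbor, seize);
  if (err != MDBX_SUCCESS)
    return err;
  /* later, once the datafile is prepared, the initially seized lock is
   * relaxed to the operational level: */
  return (env->flags & MDBX_EXCLUSIVE) ? MDBX_SUCCESS : lck_downgrade(env);
}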


@ -1,250 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#include "internals.h"
__cold void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args) {
ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0);
if (globals.logger.ptr) {
if (globals.logger_buffer == nullptr)
globals.logger.fmt(level, function, line, fmt, args);
else {
const int len = vsnprintf(globals.logger_buffer, globals.logger_buffer_size, fmt, args);
if (len > 0)
globals.logger.nofmt(level, function, line, globals.logger_buffer, len);
}
} else {
#if defined(_WIN32) || defined(_WIN64)
if (IsDebuggerPresent()) {
int prefix_len = 0;
char *prefix = nullptr;
if (function && line > 0)
prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line);
else if (function)
prefix_len = osal_asprintf(&prefix, "%s: ", function);
else if (line > 0)
prefix_len = osal_asprintf(&prefix, "%d: ", line);
if (prefix_len > 0 && prefix) {
OutputDebugStringA(prefix);
osal_free(prefix);
}
char *msg = nullptr;
int msg_len = osal_vasprintf(&msg, fmt, args);
if (msg_len > 0 && msg) {
OutputDebugStringA(msg);
osal_free(msg);
}
}
#else
if (function && line > 0)
fprintf(stderr, "%s:%d ", function, line);
else if (function)
fprintf(stderr, "%s: ", function);
else if (line > 0)
fprintf(stderr, "%d: ", line);
vfprintf(stderr, fmt, args);
fflush(stderr);
#endif
}
ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0);
}
__cold void debug_log(int level, const char *function, int line, const char *fmt, ...) {
va_list args;
va_start(args, fmt);
debug_log_va(level, function, line, fmt, args);
va_end(args);
}
__cold void log_error(const int err, const char *func, unsigned line) {
assert(err != MDBX_SUCCESS);
if (unlikely(globals.loglevel >= MDBX_LOG_DEBUG)) {
const bool is_error = err != MDBX_RESULT_TRUE && err != MDBX_NOTFOUND;
char buf[256];
debug_log(is_error ? MDBX_LOG_ERROR : MDBX_LOG_VERBOSE, func, line, "%s %d (%s)\n",
is_error ? "error" : "condition", err, mdbx_strerror_r(err, buf, sizeof(buf)));
}
}
/* Dump a val in ascii or hexadecimal. */
__cold const char *mdbx_dump_val(const MDBX_val *val, char *const buf, const size_t bufsize) {
if (!val)
return "<null>";
if (!val->iov_len)
return "<empty>";
if (!buf || bufsize < 4)
return nullptr;
if (!val->iov_base) {
int len = snprintf(buf, bufsize, "<nullptr.%zu>", val->iov_len);
assert(len > 0 && (size_t)len < bufsize);
(void)len;
return buf;
}
bool is_ascii = true;
const uint8_t *const data = val->iov_base;
for (size_t i = 0; i < val->iov_len; i++)
if (data[i] < ' ' || data[i] > '~') {
is_ascii = false;
break;
}
if (is_ascii) {
int len = snprintf(buf, bufsize, "%.*s", (val->iov_len > INT_MAX) ? INT_MAX : (int)val->iov_len, data);
assert(len > 0 && (size_t)len < bufsize);
(void)len;
} else {
char *const detent = buf + bufsize - 2;
char *ptr = buf;
*ptr++ = '<';
for (size_t i = 0; i < val->iov_len && ptr < detent; i++) {
const char hex[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
*ptr++ = hex[data[i] >> 4];
*ptr++ = hex[data[i] & 15];
}
if (ptr < detent)
*ptr++ = '>';
*ptr = '\0';
}
return buf;
}
/*------------------------------------------------------------------------------
LY: debug stuff */
__cold const char *pagetype_caption(const uint8_t type, char buf4unknown[16]) {
switch (type) {
case P_BRANCH:
return "branch";
case P_LEAF:
return "leaf";
case P_LEAF | P_SUBP:
return "subleaf";
case P_LEAF | P_DUPFIX:
return "dupfix-leaf";
case P_LEAF | P_DUPFIX | P_SUBP:
return "dupfix-subleaf";
case P_LEAF | P_DUPFIX | P_SUBP | P_LEGACY_DIRTY:
return "dupfix-subleaf.legacy-dirty";
case P_LARGE:
return "large";
default:
snprintf(buf4unknown, 16, "unknown_0x%x", type);
return buf4unknown;
}
}
__cold static const char *leafnode_type(node_t *n) {
static const char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
return (node_flags(n) & N_BIG) ? ": large page" : tp[!!(node_flags(n) & N_DUP)][!!(node_flags(n) & N_TREE)];
}
/* Display all the keys in the page. */
__cold void page_list(page_t *mp) {
pgno_t pgno = mp->pgno;
const char *type;
node_t *node;
size_t i, nkeys, nsize, total = 0;
MDBX_val key;
DKBUF;
switch (page_type(mp)) {
case P_BRANCH:
type = "Branch page";
break;
case P_LEAF:
type = "Leaf page";
break;
case P_LEAF | P_SUBP:
type = "Leaf sub-page";
break;
case P_LEAF | P_DUPFIX:
type = "Leaf2 page";
break;
case P_LEAF | P_DUPFIX | P_SUBP:
type = "Leaf2 sub-page";
break;
case P_LARGE:
VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->pages);
return;
case P_META:
VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, unaligned_peek_u64(4, page_meta(mp)->txnid_a));
return;
default:
VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->flags);
return;
}
nkeys = page_numkeys(mp);
VERBOSE("%s %" PRIaPGNO " numkeys %zu\n", type, pgno, nkeys);
for (i = 0; i < nkeys; i++) {
if (is_dupfix_leaf(mp)) { /* DUPFIX pages have no entries[] or node headers */
key = page_dupfix_key(mp, i, nsize = mp->dupfix_ksize);
total += nsize;
VERBOSE("key %zu: nsize %zu, %s\n", i, nsize, DKEY(&key));
continue;
}
node = page_node(mp, i);
key.iov_len = node_ks(node);
key.iov_base = node->payload;
nsize = NODESIZE + key.iov_len;
if (is_branch(mp)) {
VERBOSE("key %zu: page %" PRIaPGNO ", %s\n", i, node_pgno(node), DKEY(&key));
total += nsize;
} else {
if (node_flags(node) & N_BIG)
nsize += sizeof(pgno_t);
else
nsize += node_ds(node);
total += nsize;
nsize += sizeof(indx_t);
VERBOSE("key %zu: nsize %zu, %s%s\n", i, nsize, DKEY(&key), leafnode_type(node));
}
total = EVEN_CEIL(total);
}
VERBOSE("Total: header %u + contents %zu + unused %zu\n", is_dupfix_leaf(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->lower,
total, page_room(mp));
}
__cold static int setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, union logger_union logger, char *buffer,
size_t buffer_size) {
ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0);
const int rc = globals.runtime_flags | (globals.loglevel << 16);
if (level != MDBX_LOG_DONTCHANGE)
globals.loglevel = (uint8_t)level;
if (flags != MDBX_DBG_DONTCHANGE) {
flags &=
#if MDBX_DEBUG
MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER |
#endif
MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE;
globals.runtime_flags = (uint8_t)flags;
}
assert(MDBX_LOGGER_DONTCHANGE == ((MDBX_debug_func *)(intptr_t)-1));
if (logger.ptr != (void *)((intptr_t)-1)) {
globals.logger.ptr = logger.ptr;
globals.logger_buffer = buffer;
globals.logger_buffer_size = buffer_size;
}
ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0);
return rc;
}
__cold int mdbx_setup_debug_nofmt(MDBX_log_level_t level, MDBX_debug_flags_t flags, MDBX_debug_func_nofmt *logger,
char *buffer, size_t buffer_size) {
union logger_union thunk;
thunk.nofmt = (logger && buffer && buffer_size) ? logger : MDBX_LOGGER_NOFMT_DONTCHANGE;
return setup_debug(level, flags, thunk, buffer, buffer_size);
}
__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, MDBX_debug_func *logger) {
union logger_union thunk;
thunk.fmt = logger;
return setup_debug(level, flags, thunk, nullptr, 0);
}
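The setup_debug() plumbing above is reached through the public mdbx_setup_debug() and mdbx_setup_debug_nofmt() entry points. A minimal sketch of installing an application logger (the callback name and chosen level are illustrative):

#include "mdbx.h"
#include <stdarg.h>
#include <stdio.h>

/* Callback matching the MDBX_debug_func signature: forward library
 * messages to stderr. */
static void app_logger(MDBX_log_level_t level, const char *function, int line,
                       const char *fmt, va_list args) {
  if (function && line > 0)
    fprintf(stderr, "mdbx[%u] %s:%d ", (unsigned)level, function, line);
  vfprintf(stderr, fmt, args);
}

static void enable_mdbx_logging(void) {
  /* raise the log level, keep the debug flags unchanged */
  mdbx_setup_debug(MDBX_LOG_VERBOSE, MDBX_DBG_DONTCHANGE, app_logger);
}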


@ -1,159 +0,0 @@
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
#pragma once
#include "essentials.h"
#ifndef __Wpedantic_format_voidptr
MDBX_MAYBE_UNUSED static inline const void *__Wpedantic_format_voidptr(const void *ptr) { return ptr; }
#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
#endif /* __Wpedantic_format_voidptr */
MDBX_INTERNAL void MDBX_PRINTF_ARGS(4, 5) debug_log(int level, const char *function, int line, const char *fmt, ...)
MDBX_PRINTF_ARGS(4, 5);
MDBX_INTERNAL void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args);
#if MDBX_DEBUG
#define LOG_ENABLED(LVL) unlikely(LVL <= globals.loglevel)
#define AUDIT_ENABLED() unlikely((globals.runtime_flags & (unsigned)MDBX_DBG_AUDIT))
#else /* MDBX_DEBUG */
#define LOG_ENABLED(LVL) (LVL < MDBX_LOG_VERBOSE && LVL <= globals.loglevel)
#define AUDIT_ENABLED() (0)
#endif /* LOG_ENABLED() & AUDIT_ENABLED() */
#if MDBX_FORCE_ASSERTIONS
#define ASSERT_ENABLED() (1)
#elif MDBX_DEBUG
#define ASSERT_ENABLED() likely((globals.runtime_flags & (unsigned)MDBX_DBG_ASSERT))
#else
#define ASSERT_ENABLED() (0)
#endif /* ASSERT_ENABLED() */
#define DEBUG_EXTRA(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_EXTRA)) \
debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \
} while (0)
#define DEBUG_EXTRA_PRINT(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_EXTRA)) \
debug_log(MDBX_LOG_EXTRA, nullptr, 0, fmt, __VA_ARGS__); \
} while (0)
#define TRACE(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_TRACE)) \
debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
} while (0)
#define DEBUG(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_DEBUG)) \
debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
} while (0)
#define VERBOSE(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \
debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
} while (0)
#define NOTICE(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_NOTICE)) \
debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
} while (0)
#define WARNING(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_WARN)) \
debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
} while (0)
#undef ERROR /* wingdi.h \
Yeah, morons from M$ put such definition to the public header. */
#define ERROR(fmt, ...) \
do { \
if (LOG_ENABLED(MDBX_LOG_ERROR)) \
debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
} while (0)
#define FATAL(fmt, ...) debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
#if MDBX_DEBUG
#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line)
#else /* MDBX_DEBUG */
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, unsigned line);
#define ASSERT_FAIL(env, msg, func, line) \
do { \
(void)(env); \
assert_fail(msg, func, line); \
} while (0)
#endif /* MDBX_DEBUG */
#define ENSURE_MSG(env, expr, msg) \
do { \
if (unlikely(!(expr))) \
ASSERT_FAIL(env, msg, __func__, __LINE__); \
} while (0)
#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)
/* assert(3) variant in environment context */
#define eASSERT(env, expr) \
do { \
if (ASSERT_ENABLED()) \
ENSURE(env, expr); \
} while (0)
/* assert(3) variant in cursor context */
#define cASSERT(mc, expr) eASSERT((mc)->txn->env, expr)
/* assert(3) variant in transaction context */
#define tASSERT(txn, expr) eASSERT((txn)->env, expr)
#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */
#undef assert
#define assert(expr) eASSERT(nullptr, expr)
#endif
MDBX_MAYBE_UNUSED static inline void jitter4testing(bool tiny) {
#if MDBX_DEBUG
if (globals.runtime_flags & (unsigned)MDBX_DBG_JITTER)
osal_jitter(tiny);
#else
(void)tiny;
#endif
}
MDBX_MAYBE_UNUSED MDBX_INTERNAL void page_list(page_t *mp);
MDBX_INTERNAL const char *pagetype_caption(const uint8_t type, char buf4unknown[16]);
/* Key size which fits in a DKBUF (debug key buffer). */
#define DKBUF_MAX 127
#define DKBUF char dbg_kbuf[DKBUF_MAX * 4 + 2]
#define DKEY(x) mdbx_dump_val(x, dbg_kbuf, DKBUF_MAX * 2 + 1)
#define DVAL(x) mdbx_dump_val(x, dbg_kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1)
#if MDBX_DEBUG
#define DKBUF_DEBUG DKBUF
#define DKEY_DEBUG(x) DKEY(x)
#define DVAL_DEBUG(x) DVAL(x)
#else
#define DKBUF_DEBUG ((void)(0))
#define DKEY_DEBUG(x) ("-")
#define DVAL_DEBUG(x) ("-")
#endif
MDBX_INTERNAL void log_error(const int err, const char *func, unsigned line);
MDBX_MAYBE_UNUSED static inline int log_if_error(const int err, const char *func, unsigned line) {
if (unlikely(err != MDBX_SUCCESS))
log_error(err, func, line);
return err;
}
#define LOG_IFERR(err) log_if_error((err), __func__, __LINE__)
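As a usage illustration of the macros above (hypothetical helpers, not taken from the real sources): DKBUF declares the stack buffer that DKEY()/DVAL() format into, the level macros compile away when the corresponding log level is disabled, and LOG_IFERR() logs an error code and passes it through:

static void log_pair(const MDBX_val *key, const MDBX_val *data) {
  DKBUF; /* stack buffer shared by DKEY() and DVAL() */
  DEBUG("put key %s, data %s (%zu bytes)", DKEY(key), DVAL(data),
        data->iov_len);
}

static int check_limit(MDBX_env *env, size_t size, size_t limit) {
  ENSURE(env, limit > 0); /* fails via ASSERT_FAIL() when violated */
  return LOG_IFERR((size <= limit) ? MDBX_SUCCESS : MDBX_TOO_LARGE);
}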


@ -1,6 +1,6 @@
.\" Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE. .\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_CHK 1 "2025-01-14" "MDBX 0.14" .TH MDBX_CHK 1 "2020-06-05" "MDBX 0.8.x"
.SH NAME .SH NAME
mdbx_chk \- MDBX checking tool mdbx_chk \- MDBX checking tool
.SH SYNOPSIS .SH SYNOPSIS
@ -22,12 +22,12 @@ mdbx_chk \- MDBX checking tool
[\c [\c
.BR \-i ] .BR \-i ]
[\c [\c
.BI \-s \ table\fR] .BI \-s \ subdb\fR]
.BR \ dbpath .BR \ dbpath
.SH DESCRIPTION .SH DESCRIPTION
The The
.B mdbx_chk .B mdbx_chk
utility is intended to check an MDBX database file. utility intended to check an MDBX database file.
.SH OPTIONS .SH OPTIONS
.TP .TP
.BR \-V .BR \-V
@ -44,6 +44,9 @@ during B-tree traversal and basic info of each GC record.
If \fB\-vvvvv\fP is given, turn maximal verbosity, display the full list If \fB\-vvvvv\fP is given, turn maximal verbosity, display the full list
of page IDs in the GC records and size of each key-value pair of database(s). of page IDs in the GC records and size of each key-value pair of database(s).
.TP .TP
.BR \-n
Open MDBX environment(s) which do not use subdirectories.
.TP
.BR \-q .BR \-q
Be quiet; do not output anything even if an error was detected. Be quiet; do not output anything even if an error was detected.
.TP .TP
@ -55,7 +58,7 @@ check, including full check of all meta-pages and actual size of database file.
.BR \-w .BR \-w
Open environment in read-write mode and lock for writing while checking. Open environment in read-write mode and lock for writing while checking.
This could be impossible if environment already used by another process(s) This could be impossible if environment already used by another process(s)
in an incompatible read-write mode. This allows rollback to last steady commit in an incompatible read-write mode. This allow rollback to last steady commit
(in case environment was not closed properly) and then check transaction IDs (in case environment was not closed properly) and then check transaction IDs
of meta-pages. Otherwise, without \fB\-w\fP option environment will be of meta-pages. Otherwise, without \fB\-w\fP option environment will be
opened in read-only mode. opened in read-only mode.
@ -69,29 +72,8 @@ pages.
Ignore wrong order errors, which will likely false-positive if custom Ignore wrong order errors, which will likely false-positive if custom
comparator(s) was used. comparator(s) was used.
.TP .TP
.BR \-s \ table .BR \-s \ subdb
Verify and show info only for a specific table. Verify and show info only for a specific subdatabase.
.TP
.BR \-0 | \-1 | \-2
Use the specified meta-page (0, 1, or 2) for checking.
.TP
.BR \-t
Switch to the specified meta-page on successful check.
.TP
.BR \-T
Switch to the specified meta-page EVEN ON UNSUCCESSFUL CHECK!
.TP
.BR \-u
Warms up the DB before checking by notifying the OS kernel of subsequent access to the database pages.
.TP
.BR \-U
Warms up the DB before checking by notifying the OS kernel of subsequent access to the database pages,
then forcibly loads them by sequential access and tries to lock the database pages in memory.
.TP
.BR \-n
Open MDBX environment(s) which do not use subdirectories.
This is a legacy option. For now MDBX handles this automatically.
.SH DIAGNOSTICS .SH DIAGNOSTICS
Exit status is zero if no errors occur. Errors result in a non-zero exit status Exit status is zero if no errors occur. Errors result in a non-zero exit status
and a diagnostic message being written to standard error and a diagnostic message being written to standard error
@ -101,6 +83,5 @@ if no quiet mode was requested.
.BR mdbx_copy (1), .BR mdbx_copy (1),
.BR mdbx_dump (1), .BR mdbx_dump (1),
.BR mdbx_load (1) .BR mdbx_load (1)
.BR mdbx_drop (1)
.SH AUTHOR .SH AUTHOR
Leonid Yuriev <https://gitflic.ru/user/erthink> Leonid Yuriev <https://github.com/erthink>
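The -u and -U options correspond to the mdbx_env_warmup() API. A hedged C sketch of the -U behaviour, using the same flag set and timeout as the bundled tools (one hour, expressed in 1/65536-second units):

#include "mdbx.h"

/* Sketch of a -U style warmup before checking; adjust flags and timeout
 * to taste. */
static int warmup_before_check(MDBX_env *env) {
  const MDBX_warmup_flags_t flags =
      MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock;
  return mdbx_env_warmup(env, NULL, flags, 3600 * 65536);
}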


@ -1,8 +1,8 @@
.\" Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE. .\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_COPY 1 "2025-01-14" "MDBX 0.14" .TH MDBX_COPY 1 "2020-06-05" "MDBX 0.8.x"
.SH NAME .SH NAME
mdbx_copy \- MDBX environment copy tool mdbx_copy \- MDBX environment copy tool
.SH SYNOPSIS .SH SYNOPSIS
@ -14,10 +14,6 @@ mdbx_copy \- MDBX environment copy tool
[\c [\c
.BR \-c ] .BR \-c ]
[\c [\c
.BR \-d ]
[\c
.BR \-p ]
[\c
.BR \-n ] .BR \-n ]
.B src_path .B src_path
[\c [\c
@ -49,32 +45,8 @@ or unused pages will be omitted from the copy. This option will
slow down the backup process as it is more CPU-intensive. slow down the backup process as it is more CPU-intensive.
Currently it fails if the environment has suffered a page leak. Currently it fails if the environment has suffered a page leak.
.TP .TP
.BR \-d
Alters the geometry to force the copy to be a dynamic-size DB,
which can grow and shrink by reasonable steps on the fly.
.TP
.BR \-p
Use read-transaction parking/ousting while copying an MVCC snapshot.
This allows a writing transaction to oust the read
transaction used to copy the database if copying takes so long
that it would interfere with recycling of old MVCC snapshots
and might lead to an overflow of the database.
However, if the reading transaction is ousted, the copy is
aborted before successful completion. Thus, this option
allows copying the database without interfering with write
transactions and without a threat of database overflow, at the cost
that copying may be aborted to prevent such conditions.
.TP
.BR \-u
Warms up the DB before copying by notifying the OS kernel of subsequent access to the database pages.
.TP
.BR \-U
Warms up the DB before copying by notifying the OS kernel of subsequent access to the database pages,
then forcibly loads them by sequential access and tries to lock the database pages in memory.
.TP
.BR \-n .BR \-n
Open MDBX environment(s) which do not use subdirectories. Open MDBX environment(s) which do not use subdirectories.
This is a legacy option. For now, MDBX handles this automatically.
.SH DIAGNOSTICS .SH DIAGNOSTICS
Exit status is zero if no errors occur. Exit status is zero if no errors occur.
@ -89,7 +61,6 @@ free during copying cannot be reused until the copy is done.
.BR mdbx_chk (1), .BR mdbx_chk (1),
.BR mdbx_stat (1), .BR mdbx_stat (1),
.BR mdbx_load (1) .BR mdbx_load (1)
.BR mdbx_drop (1)
.SH AUTHOR .SH AUTHOR
Howard Chu of Symas Corporation <http://www.symas.com>, Howard Chu of Symas Corporation <http://www.symas.com>,
Leonid Yuriev <https://gitflic.ru/user/erthink> Leonid Yuriev <https://github.com/erthink>
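The -c, -d and -p options map to the MDBX_CP_COMPACT, MDBX_CP_FORCE_DYNAMIC_SIZE and MDBX_CP_THROTTLE_MVCC flags of mdbx_env_copy(), as the tool source further below shows. A minimal programmatic sketch (the destination path is supplied by the caller):

#include "mdbx.h"

/* Sketch: roughly the programmatic equivalent of `mdbx_copy -c -d -p`. */
static int backup_compacted(MDBX_env *env, const char *dest_path) {
  const MDBX_copy_flags_t flags =
      MDBX_CP_COMPACT | MDBX_CP_FORCE_DYNAMIC_SIZE | MDBX_CP_THROTTLE_MVCC;
  return mdbx_env_copy(env, dest_path, flags);
}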


@ -1,48 +0,0 @@
.\" Copyright 2021-2025 Leonid Yuriev <leo@yuriev.ru>.
.\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_DROP 1 "2025-01-14" "MDBX 0.14"
.SH NAME
mdbx_drop \- MDBX database delete tool
.SH SYNOPSIS
.B mdbx_drop
[\c
.BR \-V ]
[\c
.BR \-d ]
[\c
.BI \-s \ table\fR]
[\c
.BR \-n ]
.BR \ dbpath
.SH DESCRIPTION
The
.B mdbx_drop
utility empties or deletes a database in the specified
environment.
.SH OPTIONS
.TP
.BR \-V
Write the library version number to the standard output, and exit.
.TP
.BR \-d
Delete the specified database, don't just empty it.
.TP
.BR \-s \ table
Operate on a specific table. If no table is specified, only the main table is dropped.
.TP
.BR \-n
Operate on an MDBX database which does not use subdirectories.
This is a legacy option. For now, MDBX handles this automatically.
.SH DIAGNOSTICS
Exit status is zero if no errors occur.
Errors result in a non-zero exit status and
a diagnostic message being written to standard error.
.SH "SEE ALSO"
.BR mdbx_load (1),
.BR mdbx_copy (1),
.BR mdbx_chk (1),
.BR mdbx_stat (1)
.SH AUTHOR
Howard Chu of Symas Corporation <http://www.symas.com>
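The same operation is available programmatically through mdbx_drop(). A minimal sketch (transaction handling simplified; the table name comes from the caller):

#include "mdbx.h"
#include <stdbool.h>

/* Sketch: empty (delete_it == false) or delete (delete_it == true) a
 * named table, the rough counterpart of `mdbx_drop [-d] -s table`. */
static int drop_table(MDBX_env *env, const char *table, bool delete_it) {
  MDBX_txn *txn = NULL;
  int rc = mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &txn);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_dbi dbi;
  rc = mdbx_dbi_open(txn, table, MDBX_DB_ACCEDE, &dbi);
  if (rc == MDBX_SUCCESS)
    rc = mdbx_drop(txn, dbi, delete_it);
  if (rc == MDBX_SUCCESS)
    return mdbx_txn_commit(txn);
  mdbx_txn_abort(txn);
  return rc;
}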


@ -1,8 +1,8 @@
.\" Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE. .\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_DUMP 1 "2025-01-14" "MDBX 0.14" .TH MDBX_DUMP 1 "2020-06-05" "MDBX 0.8.x"
.SH NAME .SH NAME
mdbx_dump \- MDBX environment export tool mdbx_dump \- MDBX environment export tool
.SH SYNOPSIS .SH SYNOPSIS
@ -12,8 +12,6 @@ mdbx_dump \- MDBX environment export tool
[\c [\c
.BR \-q ] .BR \-q ]
[\c [\c
.BR \-c ]
[\c
.BI \-f \ file\fR] .BI \-f \ file\fR]
[\c [\c
.BR \-l ] .BR \-l ]
@ -21,7 +19,7 @@ mdbx_dump \- MDBX environment export tool
.BR \-p ] .BR \-p ]
[\c [\c
.BR \-a \ | .BR \-a \ |
.BI \-s \ table\fR] .BI \-s \ subdb\fR]
[\c [\c
.BR \-r ] .BR \-r ]
[\c [\c
@ -43,9 +41,6 @@ Write the library version number to the standard output, and exit.
.BR \-q .BR \-q
Be quiet. Be quiet.
.TP .TP
.BR \-c
Concise mode without repeating keys in a dump, but incompatible with Berkeley DB and LMDB.
.TP
.BR \-f \ file .BR \-f \ file
Write to the specified file instead of to the standard output. Write to the specified file instead of to the standard output.
.TP .TP
@ -63,25 +58,16 @@ are considered printing characters, and databases dumped in this manner may
be less portable to external systems. be less portable to external systems.
.TP .TP
.BR \-a .BR \-a
Dump all of the tables in the environment. Dump all of the subdatabases in the environment.
.TP .TP
.BR \-s \ table .BR \-s \ subdb
Dump a specific table. If no database is specified, only the main table is dumped. Dump a specific subdatabase. If no database is specified, only the main database is dumped.
.TP .TP
.BR \-r .BR \-r
Rescue mode. Ignore some errors to dump corrupted DB. Rescue mode. Ignore some errors to dump corrupted DB.
.TP .TP
.BR \-u
Warms up the DB before dumping by notifying the OS kernel of subsequent access to the database pages.
.TP
.BR \-U
Warms up the DB before dumping by notifying the OS kernel of subsequent access to the database pages,
then forcibly loads them by sequential access and tries to lock the database pages in memory.
.TP
.BR \-n .BR \-n
Dump an MDBX database which does not use subdirectories. Dump an MDBX database which does not use subdirectories.
This is a legacy option. For now, MDBX handles this automatically.
.SH DIAGNOSTICS .SH DIAGNOSTICS
Exit status is zero if no errors occur. Exit status is zero if no errors occur.
Errors result in a non-zero exit status and Errors result in a non-zero exit status and
@ -100,7 +86,6 @@ utility to load the database using the correct comparison functions.
.BR mdbx_copy (1), .BR mdbx_copy (1),
.BR mdbx_chk (1), .BR mdbx_chk (1),
.BR mdbx_stat (1) .BR mdbx_stat (1)
.BR mdbx_drop (1)
.SH AUTHOR .SH AUTHOR
Howard Chu of Symas Corporation <http://www.symas.com>, Howard Chu of Symas Corporation <http://www.symas.com>,
Leonid Yuriev <https://gitflic.ru/user/erthink> Leonid Yuriev <https://github.com/erthink>


@ -1,8 +1,8 @@
.\" Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE. .\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_LOAD 1 "2025-01-14" "MDBX 0.14" .TH MDBX_LOAD 1 "2020-06-05" "MDBX 0.8.x"
.SH NAME .SH NAME
mdbx_load \- MDBX environment import tool mdbx_load \- MDBX environment import tool
.SH SYNOPSIS .SH SYNOPSIS
@ -16,7 +16,7 @@ mdbx_load \- MDBX environment import tool
[\c [\c
.BI \-f \ file\fR] .BI \-f \ file\fR]
[\c [\c
.BI \-s \ table\fR] .BI \-s \ subdb\fR]
[\c [\c
.BR \-N ] .BR \-N ]
[\c [\c
@ -40,19 +40,6 @@ must be in the output format specified by the
utility or as specified by the utility or as specified by the
.B -T .B -T
option below. option below.
A simple escape mechanism, where newline and backslash (\\) characters are special, is
applied to the text input. Newline characters are interpreted as record separators.
Backslash characters in the text will be interpreted in one of two ways: If the backslash
character precedes another backslash character, the pair will be interpreted as a literal
backslash. If the backslash character precedes any other character, the two characters
following the backslash will be interpreted as a hexadecimal specification of a single
character; for example, \\0a is a newline character in the ASCII character set.
For this reason, any backslash or newline characters that naturally occur in the text
input must be escaped to avoid misinterpretation by
.BR mdbx_load .
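For example, a record whose key is the string "path" and whose value is "C:\tmp" followed by a newline and "b" would appear in the -T input as the following paired lines, with the literal backslash escaped as \\ and the newline as \0a:

path
C:\\tmp\0ab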
.SH OPTIONS .SH OPTIONS
.TP .TP
.BR \-V .BR \-V
@ -71,11 +58,11 @@ on a database that uses custom compare functions.
.BR \-f \ file .BR \-f \ file
Read from the specified file instead of from the standard input. Read from the specified file instead of from the standard input.
.TP .TP
.BR \-s \ table .BR \-s \ subdb
Load a specific table. If no table is specified, data is loaded into the main table. Load a specific subdatabase. If no database is specified, data is loaded into the main database.
.TP .TP
.BR \-N .BR \-N
Don't overwrite existing records when loading into an already existing table; just skip them. Don't overwrite existing records when loading into an already existing database; just skip them.
.TP .TP
.BR \-T .BR \-T
Load data from simple text files. The input must be paired lines of text, where the first Load data from simple text files. The input must be paired lines of text, where the first
@ -87,7 +74,18 @@ Rescue mode. Ignore errors to load corrupted DB dump. Rescue mode. Ignore errors to load corrupted DB dump.
.TP .TP
.BR \-n .BR \-n
Load an MDBX database which does not use subdirectories. Load an MDBX database which does not use subdirectories.
This is a legacy option. For now, MDBX handles this automatically.
A simple escape mechanism, where newline and backslash (\\) characters are special, is
applied to the text input. Newline characters are interpreted as record separators.
Backslash characters in the text will be interpreted in one of two ways: If the backslash
character precedes another backslash character, the pair will be interpreted as a literal
backslash. If the backslash character precedes any other character, the two characters
following the backslash will be interpreted as a hexadecimal specification of a single
character; for example, \\0a is a newline character in the ASCII character set.
For this reason, any backslash or newline characters that naturally occur in the text
input must be escaped to avoid misinterpretation by
.BR mdbx_load .
.SH DIAGNOSTICS .SH DIAGNOSTICS
Exit status is zero if no errors occur. Exit status is zero if no errors occur.
@ -99,7 +97,6 @@ a diagnostic message being written to standard error.
.BR mdbx_chk (1), .BR mdbx_chk (1),
.BR mdbx_stat (1), .BR mdbx_stat (1),
.BR mdbx_copy (1) .BR mdbx_copy (1)
.BR mdbx_drop (1)
.SH AUTHOR .SH AUTHOR
Howard Chu of Symas Corporation <http://www.symas.com>, Howard Chu of Symas Corporation <http://www.symas.com>,
Leonid Yuriev <https://gitflic.ru/user/erthink> Leonid Yuriev <https://github.com/erthink>


@ -1,8 +1,8 @@
.\" Copyright 2015-2025 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved.
.\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>.
.\" Copying restrictions apply. See COPYRIGHT/LICENSE. .\" Copying restrictions apply. See COPYRIGHT/LICENSE.
.TH MDBX_STAT 1 "2025-01-14" "MDBX 0.14" .TH MDBX_STAT 1 "2020-06-05" "MDBX 0.8.x"
.SH NAME .SH NAME
mdbx_stat \- MDBX environment status tool mdbx_stat \- MDBX environment status tool
.SH SYNOPSIS .SH SYNOPSIS
@ -12,8 +12,6 @@ mdbx_stat \- MDBX environment status tool
[\c [\c
.BR \-q ] .BR \-q ]
[\c [\c
.BR \-p ]
[\c
.BR \-e ] .BR \-e ]
[\c [\c
.BR \-f [ f [ f ]]] .BR \-f [ f [ f ]]]
@ -21,7 +19,7 @@ mdbx_stat \- MDBX environment status tool
.BR \-r [ r ]] .BR \-r [ r ]]
[\c [\c
.BR \-a \ | .BR \-a \ |
.BI \-s \ table\fR] .BI \-s \ subdb\fR]
.BR \ dbpath .BR \ dbpath
[\c [\c
.BR \-n ] .BR \-n ]
@ -37,18 +35,13 @@ Write the library version number to the standard output, and exit.
.BR \-q .BR \-q
Be quiet. Be quiet.
.TP .TP
.BR \-p
Display overall statistics of page operations of all (running, completed
and aborted) transactions in the current multi-process session (since the
first process opened the database after everyone had previously closed it).
.TP
.BR \-e .BR \-e
Display information about the database environment. Display information about the database environment.
.TP .TP
.BR \-f .BR \-f
Display information about the environment GC. Display information about the environment freelist.
If \fB\-ff\fP is given, summarize each GC/freelist entry. If \fB\-ff\fP is given, summarize each freelist entry.
If \fB\-fff\fP is given, display the full list of page IDs in the GC/freelist. If \fB\-fff\fP is given, display the full list of page IDs in the freelist.
.TP .TP
.BR \-r .BR \-r
Display information about the environment reader table. Display information about the environment reader table.
@ -61,16 +54,13 @@ table and clear them. The reader table will be printed again
after the check is performed. after the check is performed.
.TP .TP
.BR \-a .BR \-a
Display the status of all of the tables in the environment. Display the status of all of the subdatabases in the environment.
.TP .TP
.BR \-s \ table .BR \-s \ subdb
Display the status of a specific table. Display the status of a specific subdatabase.
.TP .TP
.BR \-n .BR \-n
Display the status of an MDBX database which does not use subdirectories. Display the status of an MDBX database which does not use subdirectories.
This is a legacy option. For now, MDBX handles this automatically
for existing databases, but it may be required while creating new ones.
.SH DIAGNOSTICS .SH DIAGNOSTICS
Exit status is zero if no errors occur. Exit status is zero if no errors occur.
Errors result in a non-zero exit status and Errors result in a non-zero exit status and
@ -80,7 +70,6 @@ a diagnostic message being written to standard error.
.BR mdbx_copy (1), .BR mdbx_copy (1),
.BR mdbx_dump (1), .BR mdbx_dump (1),
.BR mdbx_load (1) .BR mdbx_load (1)
.BR mdbx_drop (1)
.SH AUTHOR .SH AUTHOR
Howard Chu of Symas Corporation <http://www.symas.com>, Howard Chu of Symas Corporation <http://www.symas.com>,
Leonid Yuriev <https://gitflic.ru/user/erthink> Leonid Yuriev <https://github.com/erthink>
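Most of the figures mdbx_stat prints are also available programmatically via mdbx_env_info_ex() and mdbx_env_stat_ex(); a minimal sketch (the chosen fields are illustrative):

#include "mdbx.h"
#include <inttypes.h>
#include <stdio.h>

/* Sketch: print a few of the figures that `mdbx_stat -e` reports. */
static int print_env_figures(MDBX_env *env) {
  MDBX_envinfo info;
  MDBX_stat stat;
  int rc = mdbx_env_info_ex(env, NULL, &info, sizeof(info));
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_env_stat_ex(env, NULL, &stat, sizeof(stat));
  if (rc != MDBX_SUCCESS)
    return rc;
  printf("pagesize %u, tree depth %u, entries %" PRIu64 "\n", stat.ms_psize,
         stat.ms_depth, stat.ms_entries);
  printf("mapsize %" PRIu64 ", readers %u/%u\n", info.mi_mapsize,
         info.mi_numreaders, info.mi_maxreaders);
  return MDBX_SUCCESS;
}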

File diff suppressed because it is too large.

src/mdbx_chk.c

File diff suppressed because it is too large.


@ -1,10 +1,17 @@
/// \copyright SPDX-License-Identifier: Apache-2.0 /* mdbx_copy.c - memory-mapped database backup tool */
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments. /*
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025 * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>
/// * and other libmdbx authors: please see AUTHORS file.
/// mdbx_copy.c - memory-mapped database backup tool * All rights reserved.
/// *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#ifdef _MSC_VER #ifdef _MSC_VER
#if _MSC_VER > 1800 #if _MSC_VER > 1800
@ -13,8 +20,8 @@
#pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */
#endif /* _MSC_VER (warnings) */ #endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #define MDBX_TOOLS /* Avoid using internal mdbx_assert() */
#include "essentials.h" #include "internals.h"
#if defined(_WIN32) || defined(_WIN64) #if defined(_WIN32) || defined(_WIN64)
#include "wingetopt.h" #include "wingetopt.h"
@ -38,63 +45,33 @@ static void signal_handler(int sig) {
static void usage(const char *prog) { static void usage(const char *prog) {
fprintf(stderr, fprintf(stderr,
"usage: %s [-V] [-q] [-c] [-d] [-p] [-u|U] src_path [dest_path]\n" "usage: %s [-V] [-q] [-c] src_path [dest_path]\n"
" -V\t\tprint version and exit\n" " -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n" " -q\t\tbe quiet\n"
" -c\t\tenable compactification (skip unused pages)\n" " -c\t\tenable compactification (skip unused pages)\n"
" -d\t\tenforce copy to be a dynamic size DB\n"
" -p\t\tusing transaction parking/ousting during copying MVCC-snapshot\n"
" \t\tto avoid stopping recycling and overflowing the DB\n"
" -u\t\twarmup database before copying\n"
" -U\t\twarmup and try lock database pages in memory before copying\n"
" src_path\tsource database\n" " src_path\tsource database\n"
" dest_path\tdestination (stdout if not specified)\n", " dest_path\tdestination (stdout if not specified)\n",
prog); prog);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
static void logger(MDBX_log_level_t level, const char *function, int line, const char *fmt, va_list args) {
static const char *const prefixes[] = {
"!!!fatal: ", // 0 fatal
" ! ", // 1 error
" ~ ", // 2 warning
" ", // 3 notice
" //", // 4 verbose
};
if (level < MDBX_LOG_DEBUG) {
if (function && line)
fprintf(stderr, "%s", prefixes[level]);
vfprintf(stderr, fmt, args);
}
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
int rc; int rc;
MDBX_env *env = nullptr; MDBX_env *env = NULL;
const char *progname = argv[0], *act; const char *progname = argv[0], *act;
unsigned flags = MDBX_RDONLY; unsigned flags = MDBX_RDONLY;
unsigned cpflags = 0; unsigned cpflags = 0;
bool quiet = false; bool quiet = false;
bool warmup = false;
MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default;
for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) {
if (argv[1][1] == 'n' && argv[1][2] == '\0') if (argv[1][1] == 'n' && argv[1][2] == '\0')
flags |= MDBX_NOSUBDIR; flags |= MDBX_NOSUBDIR;
else if (argv[1][1] == 'c' && argv[1][2] == '\0') else if (argv[1][1] == 'c' && argv[1][2] == '\0')
cpflags |= MDBX_CP_COMPACT; cpflags |= MDBX_CP_COMPACT;
else if (argv[1][1] == 'd' && argv[1][2] == '\0')
cpflags |= MDBX_CP_FORCE_DYNAMIC_SIZE;
else if (argv[1][1] == 'p' && argv[1][2] == '\0')
cpflags |= MDBX_CP_THROTTLE_MVCC;
else if (argv[1][1] == 'q' && argv[1][2] == '\0') else if (argv[1][1] == 'q' && argv[1][2] == '\0')
quiet = true; quiet = true;
else if (argv[1][1] == 'u' && argv[1][2] == '\0') else if ((argv[1][1] == 'h' && argv[1][2] == '\0') ||
warmup = true; strcmp(argv[1], "--help") == 0)
else if (argv[1][1] == 'U' && argv[1][2] == '\0') {
warmup = true;
warmup_flags = MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock;
} else if ((argv[1][1] == 'h' && argv[1][2] == '\0') || strcmp(argv[1], "--help") == 0)
usage(progname); usage(progname);
else if (argv[1][1] == 'V' && argv[1][2] == '\0') { else if (argv[1][1] == 'V' && argv[1][2] == '\0') {
printf("mdbx_copy version %d.%d.%d.%d\n" printf("mdbx_copy version %d.%d.%d.%d\n"
@ -103,9 +80,12 @@ int main(int argc, char *argv[]) {
" - build: %s for %s by %s\n" " - build: %s for %s by %s\n"
" - flags: %s\n" " - flags: %s\n"
" - options: %s\n", " - options: %s\n",
mdbx_version.major, mdbx_version.minor, mdbx_version.patch, mdbx_version.tweak, mdbx_version.git.describe, mdbx_version.major, mdbx_version.minor, mdbx_version.release,
mdbx_version.git.datetime, mdbx_version.git.commit, mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_version.revision, mdbx_version.git.describe,
mdbx_build.datetime, mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, mdbx_build.options); mdbx_version.git.datetime, mdbx_version.git.commit,
mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime,
mdbx_build.target, mdbx_build.compiler, mdbx_build.flags,
mdbx_build.options);
return EXIT_SUCCESS; return EXIT_SUCCESS;
} else } else
argc = 0; argc = 0;
@ -128,24 +108,19 @@ int main(int argc, char *argv[]) {
#endif /* !WINDOWS */ #endif /* !WINDOWS */
if (!quiet) { if (!quiet) {
fprintf((argc == 2) ? stderr : stdout, "mdbx_copy %s (%s, T-%s)\nRunning for copy %s to %s...\n", fprintf((argc == 2) ? stderr : stdout,
mdbx_version.git.describe, mdbx_version.git.datetime, mdbx_version.git.tree, argv[1], "mdbx_copy %s (%s, T-%s)\nRunning for copy %s to %s...\n",
(argc == 2) ? "stdout" : argv[2]); mdbx_version.git.describe, mdbx_version.git.datetime,
fflush(nullptr); mdbx_version.git.tree, argv[1], (argc == 2) ? "stdout" : argv[2]);
mdbx_setup_debug(MDBX_LOG_NOTICE, MDBX_DBG_DONTCHANGE, logger); fflush(NULL);
} }
act = "opening environment"; act = "opening environment";
rc = mdbx_env_create(&env); rc = mdbx_env_create(&env);
if (rc == MDBX_SUCCESS) if (rc == MDBX_SUCCESS) {
rc = mdbx_env_open(env, argv[1], flags, 0); rc = mdbx_env_open(env, argv[1], flags, 0640);
if (rc == MDBX_SUCCESS && warmup) {
act = "warming up";
rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536);
} }
if (rc == MDBX_SUCCESS) {
if (!MDBX_IS_ERROR(rc)) {
act = "copying"; act = "copying";
if (argc == 2) { if (argc == 2) {
mdbx_filehandle_t fd; mdbx_filehandle_t fd;
@ -159,7 +134,8 @@ int main(int argc, char *argv[]) {
rc = mdbx_env_copy(env, argv[2], cpflags); rc = mdbx_env_copy(env, argv[2], cpflags);
} }
if (rc) if (rc)
fprintf(stderr, "%s: %s failed, error %d (%s)\n", progname, act, rc, mdbx_strerror(rc)); fprintf(stderr, "%s: %s failed, error %d (%s)\n", progname, act, rc,
mdbx_strerror(rc));
mdbx_env_close(env); mdbx_env_close(env);
return rc ? EXIT_FAILURE : EXIT_SUCCESS; return rc ? EXIT_FAILURE : EXIT_SUCCESS;


@ -1,10 +1,17 @@
/// \copyright SPDX-License-Identifier: Apache-2.0 /* mdbx_dump.c - memory-mapped database dump tool */
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments. /*
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025 * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>
/// * and other libmdbx authors: please see AUTHORS file.
/// mdbx_dump.c - memory-mapped database dump tool * All rights reserved.
/// *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
#ifdef _MSC_VER #ifdef _MSC_VER
#if _MSC_VER > 1800 #if _MSC_VER > 1800
@ -13,14 +20,13 @@
#pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */
#endif /* _MSC_VER (warnings) */ #endif /* _MSC_VER (warnings) */
#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #define MDBX_TOOLS /* Avoid using internal mdbx_assert() */
#include "essentials.h" #include "internals.h"
#include <ctype.h> #include <ctype.h>
#define PRINT 1 #define PRINT 1
#define GLOBAL 2 #define GLOBAL 2
#define CONCISE 4
static int mode = GLOBAL; static int mode = GLOBAL;
typedef struct flagbit { typedef struct flagbit {
@ -31,7 +37,7 @@ typedef struct flagbit {
flagbit dbflags[] = {{MDBX_REVERSEKEY, "reversekey"}, flagbit dbflags[] = {{MDBX_REVERSEKEY, "reversekey"},
{MDBX_DUPSORT, "dupsort"}, {MDBX_DUPSORT, "dupsort"},
{MDBX_INTEGERKEY, "integerkey"}, {MDBX_INTEGERKEY, "integerkey"},
{MDBX_DUPFIXED, "dupfix"}, {MDBX_DUPFIXED, "dupfixed"},
{MDBX_INTEGERDUP, "integerdup"}, {MDBX_INTEGERDUP, "integerdup"},
{MDBX_REVERSEDUP, "reversedup"}, {MDBX_REVERSEDUP, "reversedup"},
{0, nullptr}}; {0, nullptr}};
@ -56,33 +62,51 @@ static void signal_handler(int sig) {
#endif /* !WINDOWS */ #endif /* !WINDOWS */
static void dumpval(const MDBX_val *v) { static const char hexc[] = "0123456789abcdef";
static const char digits[] = "0123456789abcdef";
static void dumpbyte(unsigned char c) {
putchar(hexc[c >> 4]);
putchar(hexc[c & 0xf]);
}
static void text(MDBX_val *v) {
unsigned char *c, *end;
putchar(' '); putchar(' ');
for (const unsigned char *c = v->iov_base, *end = c + v->iov_len; c < end; ++c) { c = v->iov_base;
if (mode & PRINT) { end = c + v->iov_len;
if (isprint(*c) && *c != '\\') { while (c < end) {
putchar(*c); if (isprint(*c) && *c != '\\') {
continue; putchar(*c);
} else } else {
putchar('\\'); putchar('\\');
dumpbyte(*c);
} }
putchar(digits[*c >> 4]); c++;
putchar(digits[*c & 15]);
} }
putchar('\n'); putchar('\n');
} }
static void dumpval(MDBX_val *v) {
unsigned char *c, *end;
putchar(' ');
c = v->iov_base;
end = c + v->iov_len;
while (c < end)
dumpbyte(*c++);
putchar('\n');
}
bool quiet = false, rescue = false; bool quiet = false, rescue = false;
const char *prog; const char *prog;
static void error(const char *func, int rc) { static void error(const char *func, int rc) {
if (!quiet) fprintf(stderr, "%s: %s() error %d %s\n", prog, func, rc, mdbx_strerror(rc));
fprintf(stderr, "%s: %s() error %d %s\n", prog, func, rc, mdbx_strerror(rc));
} }
/* Dump in BDB-compatible format */ /* Dump in BDB-compatible format */
static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) { static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
unsigned flags; unsigned int flags;
int rc = mdbx_dbi_flags(txn, dbi, &flags); int rc = mdbx_dbi_flags(txn, dbi, &flags);
if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_dbi_flags", rc); error("mdbx_dbi_flags", rc);
@ -107,19 +131,22 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
if (mode & GLOBAL) { if (mode & GLOBAL) {
mode -= GLOBAL; mode -= GLOBAL;
if (info.mi_geo.upper != info.mi_geo.lower) if (info.mi_geo.upper != info.mi_geo.lower)
printf("geometry=l%" PRIu64 ",c%" PRIu64 ",u%" PRIu64 ",s%" PRIu64 ",g%" PRIu64 "\n", info.mi_geo.lower, printf("geometry=l%" PRIu64 ",c%" PRIu64 ",u%" PRIu64 ",s%" PRIu64
info.mi_geo.current, info.mi_geo.upper, info.mi_geo.shrink, info.mi_geo.grow); ",g%" PRIu64 "\n",
info.mi_geo.lower, info.mi_geo.current, info.mi_geo.upper,
info.mi_geo.shrink, info.mi_geo.grow);
printf("mapsize=%" PRIu64 "\n", info.mi_geo.upper); printf("mapsize=%" PRIu64 "\n", info.mi_geo.upper);
printf("maxreaders=%u\n", info.mi_maxreaders); printf("maxreaders=%u\n", info.mi_maxreaders);
MDBX_canary canary; mdbx_canary canary;
rc = mdbx_canary_get(txn, &canary); rc = mdbx_canary_get(txn, &canary);
if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_canary_get", rc); error("mdbx_canary_get", rc);
return rc; return rc;
} }
if (canary.v) if (canary.v)
printf("canary=v%" PRIu64 ",x%" PRIu64 ",y%" PRIu64 ",z%" PRIu64 "\n", canary.v, canary.x, canary.y, canary.z); printf("canary=v%" PRIu64 ",x%" PRIu64 ",y%" PRIu64 ",z%" PRIu64 "\n",
canary.v, canary.x, canary.y, canary.z);
} }
printf("format=%s\n", mode & PRINT ? "print" : "bytevalue"); printf("format=%s\n", mode & PRINT ? "print" : "bytevalue");
if (name) if (name)
@ -131,7 +158,10 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
else if (!name) else if (!name)
printf("txnid=%" PRIaTXN "\n", mdbx_txn_id(txn)); */ printf("txnid=%" PRIaTXN "\n", mdbx_txn_id(txn)); */
printf("duplicates=%d\n", (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) ? 1 : 0); printf("duplicates=%d\n", (flags & (MDBX_DUPSORT | MDBX_DUPFIXED |
MDBX_INTEGERDUP | MDBX_REVERSEDUP))
? 1
: 0);
for (int i = 0; dbflags[i].bit; i++) for (int i = 0; dbflags[i].bit; i++)
if (flags & dbflags[i].bit) if (flags & dbflags[i].bit)
printf("%s=1\n", dbflags[i].name); printf("%s=1\n", dbflags[i].name);
@ -154,32 +184,24 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
error("mdbx_cursor_open", rc); error("mdbx_cursor_open", rc);
return rc; return rc;
} }
if (rescue) { if (MDBX_DEBUG > 0 && rescue) {
rc = mdbx_cursor_ignord(cursor); cursor->mc_flags |= C_SKIPORD;
if (unlikely(rc != MDBX_SUCCESS)) { if (cursor->mc_xcursor)
error("mdbx_cursor_ignord", rc); cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD;
return rc;
}
} }
while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == MDBX_SUCCESS) { while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) ==
MDBX_SUCCESS) {
if (user_break) { if (user_break) {
rc = MDBX_EINTR; rc = MDBX_EINTR;
break; break;
} }
dumpval(&key); if (mode & PRINT) {
dumpval(&data); text(&key);
if ((flags & MDBX_DUPSORT) && (mode & CONCISE)) { text(&data);
while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT_DUP)) == MDBX_SUCCESS) { } else {
if (user_break) { dumpval(&key);
rc = MDBX_EINTR; dumpval(&data);
break;
}
putchar(' ');
dumpval(&data);
}
if (rc != MDBX_NOTFOUND)
break;
} }
} }
printf("DATA=END\n"); printf("DATA=END\n");
@ -187,80 +209,48 @@ static int dump_tbl(MDBX_txn *txn, MDBX_dbi dbi, char *name) {
rc = MDBX_SUCCESS; rc = MDBX_SUCCESS;
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
error("mdbx_cursor_get", rc); error("mdbx_cursor_get", rc);
mdbx_cursor_close(cursor);
return rc; return rc;
} }
static void usage(void) { static void usage(void) {
fprintf(stderr, fprintf(stderr,
"usage: %s " "usage: %s [-V] [-q] [-f file] [-l] [-p] [-a|-s subdb] [-r] "
"[-V] [-q] [-c] [-f file] [-l] [-p] [-r] [-a|-s table] [-u|U] "
"dbpath\n" "dbpath\n"
" -V\t\tprint version and exit\n" " -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n" " -q\t\tbe quiet\n"
" -c\t\tconcise mode without repeating keys,\n"
" \t\tbut incompatible with Berkeley DB and LMDB\n"
" -f\t\twrite to file instead of stdout\n" " -f\t\twrite to file instead of stdout\n"
" -l\t\tlist tables and exit\n" " -l\t\tlist subDBs and exit\n"
" -p\t\tuse printable characters\n" " -p\t\tuse printable characters\n"
" -r\t\trescue mode (ignore errors to dump corrupted DB)\n" " -a\t\tdump main DB and all subDBs,\n"
" -a\t\tdump main DB and all tables\n" " \t\tby default dump only the main DB\n"
" -s name\tdump only the specified named table\n" " -s\t\tdump only the named subDB\n"
" -u\t\twarmup database before dumping\n" " -r\t\trescure mode (ignore errors to dump corrupted DB)\n",
" -U\t\twarmup and try lock database pages in memory before dumping\n"
" \t\tby default dump only the main DB\n",
prog); prog);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
static void logger(MDBX_log_level_t level, const char *function, int line, const char *fmt, va_list args) {
static const char *const prefixes[] = {
"!!!fatal: ", // 0 fatal
" ! ", // 1 error
" ~ ", // 2 warning
" ", // 3 notice
" //", // 4 verbose
};
if (level < MDBX_LOG_DEBUG) {
if (function && line)
fprintf(stderr, "%s", prefixes[level]);
vfprintf(stderr, fmt, args);
}
}
static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) {
return (a->iov_len == b->iov_len && memcmp(a->iov_base, b->iov_base, a->iov_len) == 0) ? 0 : 1; return (a->iov_len == b->iov_len &&
memcmp(a->iov_base, b->iov_base, a->iov_len) == 0)
? 0
: 1;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
int i, err; int i, rc;
MDBX_env *env; MDBX_env *env;
MDBX_txn *txn; MDBX_txn *txn;
MDBX_dbi dbi; MDBX_dbi dbi;
prog = argv[0]; prog = argv[0];
char *envname; char *envname;
char *subname = nullptr, *buf4free = nullptr; char *subname = nullptr;
unsigned envflags = 0; unsigned envflags = 0;
bool alldbs = false, list = false; bool alldbs = false, list = false;
bool warmup = false;
MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default;
if (argc < 2) if (argc < 2)
usage(); usage();
while ((i = getopt(argc, argv, while ((i = getopt(argc, argv, "af:lnps:Vrq")) != EOF) {
"uU"
"a"
"f:"
"l"
"n"
"p"
"s:"
"V"
"r"
"c"
"q")) != EOF) {
switch (i) { switch (i) {
case 'V': case 'V':
printf("mdbx_dump version %d.%d.%d.%d\n" printf("mdbx_dump version %d.%d.%d.%d\n"
@ -269,9 +259,12 @@ int main(int argc, char *argv[]) {
" - build: %s for %s by %s\n" " - build: %s for %s by %s\n"
" - flags: %s\n" " - flags: %s\n"
" - options: %s\n", " - options: %s\n",
mdbx_version.major, mdbx_version.minor, mdbx_version.patch, mdbx_version.tweak, mdbx_version.git.describe, mdbx_version.major, mdbx_version.minor, mdbx_version.release,
mdbx_version.git.datetime, mdbx_version.git.commit, mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_version.revision, mdbx_version.git.describe,
mdbx_build.datetime, mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, mdbx_build.options); mdbx_version.git.datetime, mdbx_version.git.commit,
mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime,
mdbx_build.target, mdbx_build.compiler, mdbx_build.flags,
mdbx_build.options);
return EXIT_SUCCESS; return EXIT_SUCCESS;
case 'l': case 'l':
list = true; list = true;
@ -284,14 +277,13 @@ int main(int argc, char *argv[]) {
break; break;
case 'f': case 'f':
if (freopen(optarg, "w", stdout) == nullptr) { if (freopen(optarg, "w", stdout) == nullptr) {
fprintf(stderr, "%s: %s: reopen: %s\n", prog, optarg, mdbx_strerror(errno)); fprintf(stderr, "%s: %s: reopen: %s\n", prog, optarg,
mdbx_strerror(errno));
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
break; break;
case 'n': case 'n':
break; envflags |= MDBX_NOSUBDIR;
case 'c':
mode |= CONCISE;
break; break;
case 'p': case 'p':
mode |= PRINT; mode |= PRINT;
@ -307,13 +299,6 @@ int main(int argc, char *argv[]) {
case 'r': case 'r':
rescue = true; rescue = true;
break; break;
case 'u':
warmup = true;
break;
case 'U':
warmup = true;
warmup_flags = MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock;
break;
default: default:
usage(); usage();
} }
@ -337,50 +322,43 @@ int main(int argc, char *argv[]) {
envname = argv[optind]; envname = argv[optind];
if (!quiet) { if (!quiet) {
fprintf(stderr, "mdbx_dump %s (%s, T-%s)\nRunning for %s...\n", mdbx_version.git.describe, fprintf(stderr, "mdbx_dump %s (%s, T-%s)\nRunning for %s...\n",
mdbx_version.git.datetime, mdbx_version.git.tree, envname); mdbx_version.git.describe, mdbx_version.git.datetime,
mdbx_version.git.tree, envname);
fflush(nullptr); fflush(nullptr);
mdbx_setup_debug(MDBX_LOG_NOTICE, MDBX_DBG_DONTCHANGE, logger);
} }
err = mdbx_env_create(&env); rc = mdbx_env_create(&env);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_env_create", err); error("mdbx_env_create", rc);
return EXIT_FAILURE; return EXIT_FAILURE;
} }
if (alldbs || subname) { if (alldbs || subname) {
err = mdbx_env_set_maxdbs(env, 2); rc = mdbx_env_set_maxdbs(env, 2);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_env_set_maxdbs", err); error("mdbx_env_set_maxdbs", rc);
goto env_close; goto env_close;
} }
} }
err = mdbx_env_open(env, envname, envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION : MDBX_RDONLY), rc = mdbx_env_open(
0); env, envname,
if (unlikely(err != MDBX_SUCCESS)) { envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE : MDBX_RDONLY), 0);
error("mdbx_env_open", err); if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_env_open", rc);
goto env_close; goto env_close;
} }
if (warmup) { rc = mdbx_txn_begin(env, nullptr, MDBX_RDONLY, &txn);
err = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); if (unlikely(rc != MDBX_SUCCESS)) {
if (MDBX_IS_ERROR(err)) { error("mdbx_txn_begin", rc);
error("mdbx_env_warmup", err);
goto env_close;
}
}
err = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
if (unlikely(err != MDBX_SUCCESS)) {
error("mdbx_txn_begin", err);
goto env_close; goto env_close;
} }
err = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &dbi); rc = mdbx_dbi_open(txn, subname, MDBX_ACCEDE, &dbi);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_dbi_open", err); error("mdbx_dbi_open", rc);
goto txn_abort; goto txn_abort;
} }
@ -388,49 +366,43 @@ int main(int argc, char *argv[]) {
assert(dbi == MAIN_DBI); assert(dbi == MAIN_DBI);
MDBX_cursor *cursor; MDBX_cursor *cursor;
err = mdbx_cursor_open(txn, MAIN_DBI, &cursor); rc = mdbx_cursor_open(txn, MAIN_DBI, &cursor);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_cursor_open", err); error("mdbx_cursor_open", rc);
goto txn_abort; goto txn_abort;
} }
if (rescue) { if (MDBX_DEBUG > 0 && rescue) {
err = mdbx_cursor_ignord(cursor); cursor->mc_flags |= C_SKIPORD;
if (unlikely(err != MDBX_SUCCESS)) { if (cursor->mc_xcursor)
error("mdbx_cursor_ignord", err); cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD;
return err;
}
} }
bool have_raw = false; bool have_raw = false;
int count = 0; int count = 0;
MDBX_val key; MDBX_val key;
while (MDBX_SUCCESS == (err = mdbx_cursor_get(cursor, &key, nullptr, MDBX_NEXT_NODUP))) { while (MDBX_SUCCESS ==
(rc = mdbx_cursor_get(cursor, &key, nullptr, MDBX_NEXT_NODUP))) {
if (user_break) { if (user_break) {
err = MDBX_EINTR; rc = MDBX_EINTR;
break; break;
} }
if (memchr(key.iov_base, '\0', key.iov_len)) if (memchr(key.iov_base, '\0', key.iov_len))
continue; continue;
subname = osal_realloc(buf4free, key.iov_len + 1); subname = mdbx_malloc(key.iov_len + 1);
if (!subname) {
err = MDBX_ENOMEM;
break;
}
buf4free = subname;
memcpy(subname, key.iov_base, key.iov_len); memcpy(subname, key.iov_base, key.iov_len);
subname[key.iov_len] = '\0'; subname[key.iov_len] = '\0';
MDBX_dbi sub_dbi; MDBX_dbi sub_dbi;
err = mdbx_dbi_open_ex(txn, subname, MDBX_DB_ACCEDE, &sub_dbi, rescue ? equal_or_greater : nullptr, rc = mdbx_dbi_open_ex(txn, subname, MDBX_ACCEDE, &sub_dbi,
rescue ? equal_or_greater : nullptr); rescue ? equal_or_greater : nullptr,
if (unlikely(err != MDBX_SUCCESS)) { rescue ? equal_or_greater : nullptr);
if (err == MDBX_INCOMPATIBLE) { if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_INCOMPATIBLE) {
have_raw = true; have_raw = true;
continue; continue;
} }
error("mdbx_dbi_open", err); error("mdbx_dbi_open", rc);
if (!rescue) if (!rescue)
break; break;
} else { } else {
@ -438,12 +410,12 @@ int main(int argc, char *argv[]) {
if (list) { if (list) {
printf("%s\n", subname); printf("%s\n", subname);
} else { } else {
err = dump_tbl(txn, sub_dbi, subname); rc = dump_sdb(txn, sub_dbi, subname);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
if (!rescue) if (!rescue)
break; break;
if (!quiet) fprintf(stderr, "%s: %s: ignore %s for `%s` and continue\n", prog,
fprintf(stderr, "%s: %s: ignore %s for `%s` and continue\n", prog, envname, mdbx_strerror(err), subname); envname, mdbx_strerror(rc), subname);
/* Here is a hack for rescue mode, don't do that: /* Here is a hack for rescue mode, don't do that:
* - we should restart transaction in case error due * - we should restart transaction in case error due
* database corruption; * database corruption;
@ -452,51 +424,51 @@ int main(int argc, char *argv[]) {
* - this is possible since DB is opened in read-only exclusive * - this is possible since DB is opened in read-only exclusive
* mode and transaction is the same, i.e. has the same address * mode and transaction is the same, i.e. has the same address
* and so on. */ * and so on. */
err = mdbx_txn_reset(txn); rc = mdbx_txn_reset(txn);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_txn_reset", err); error("mdbx_txn_reset", rc);
goto env_close; goto env_close;
} }
err = mdbx_txn_renew(txn); rc = mdbx_txn_renew(txn);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_txn_renew", err); error("mdbx_txn_renew", rc);
goto env_close; goto env_close;
} }
} }
} }
err = mdbx_dbi_close(env, sub_dbi); rc = mdbx_dbi_close(env, sub_dbi);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
error("mdbx_dbi_close", err); error("mdbx_dbi_close", rc);
break; break;
} }
} }
mdbx_free(subname);
} }
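The rescue-mode error path inside the loop above recycles the read-only transaction with mdbx_txn_reset()/mdbx_txn_renew() instead of aborting and re-beginning it. A minimal sketch of that pattern in isolation, with recycle_ro_txn as a hypothetical helper; as the comment in the diff notes, outside this exclusive read-only setup you would normally also reopen and re-position any cursors.

/* Release the current read snapshot and immediately take a fresh one
 * on the same transaction handle. */
static int recycle_ro_txn(MDBX_txn *txn) {
  int err = mdbx_txn_reset(txn);
  if (err != MDBX_SUCCESS)
    return err;
  return mdbx_txn_renew(txn);
}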
mdbx_cursor_close(cursor); mdbx_cursor_close(cursor);
cursor = nullptr; cursor = nullptr;
if (have_raw && (!count /* || rescue */)) if (have_raw && (!count /* || rescue */))
err = dump_tbl(txn, MAIN_DBI, nullptr); rc = dump_sdb(txn, MAIN_DBI, nullptr);
else if (!count) { else if (!count) {
if (!quiet) fprintf(stderr, "%s: %s does not contain multiple databases\n", prog,
fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, envname); envname);
err = MDBX_NOTFOUND; rc = MDBX_NOTFOUND;
} }
} else { } else {
err = dump_tbl(txn, dbi, subname); rc = dump_sdb(txn, dbi, subname);
} }
switch (err) { switch (rc) {
case MDBX_NOTFOUND: case MDBX_NOTFOUND:
err = MDBX_SUCCESS; rc = MDBX_SUCCESS;
case MDBX_SUCCESS: case MDBX_SUCCESS:
break; break;
case MDBX_EINTR: case MDBX_EINTR:
if (!quiet) fprintf(stderr, "Interrupted by signal/user\n");
fprintf(stderr, "Interrupted by signal/user\n");
break; break;
default: default:
if (unlikely(err != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
error("mdbx_cursor_get", err); error("mdbx_cursor_get", rc);
} }
mdbx_dbi_close(env, dbi); mdbx_dbi_close(env, dbi);
@ -504,7 +476,6 @@ txn_abort:
mdbx_txn_abort(txn); mdbx_txn_abort(txn);
env_close: env_close:
mdbx_env_close(env); mdbx_env_close(env);
free(buf4free);
return err ? EXIT_FAILURE : EXIT_SUCCESS; return rc ? EXIT_FAILURE : EXIT_SUCCESS;
} }
Some files were not shown because too many files have changed in this diff.