From 2dc3e1ee5fe7c84cfe548e346cacb615f4b302a6 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 21 Feb 2017 20:16:54 +0300 Subject: [PATCH] mdbx: reformat and some cleanup (1/3 for rebirth). --- .travis.yml | 2 +- CHANGES | 223 - Doxyfile | 1631 ----- Makefile | 39 +- README.md | 6 +- barriers.h | 4 +- lmdb.h | 1557 ----- mdb.c | 10723 -------------------------------- mdb_chk.c | 954 --- mdb_copy.c | 81 - mdb_dump.c | 314 - mdb_load.c | 456 -- mdb_stat.c | 299 - mdbx.c | 12019 ++++++++++++++++++++++++++++++++++-- mdbx.h | 1905 +++++- mdbx_chk.c | 979 +++ mdb_copy.1 => mdbx_copy.1 | 8 +- mdbx_copy.c | 76 + mdb_dump.1 => mdbx_dump.1 | 12 +- mdbx_dump.c | 316 + mdb_load.1 => mdbx_load.1 | 14 +- mdbx_load.c | 466 ++ mdb_stat.1 => mdbx_stat.1 | 8 +- mdbx_stat.c | 306 + midl.c | 361 -- midl.h | 209 +- mtest0.c | 337 +- mtest1.c | 309 +- mtest2.c | 219 +- mtest3.c | 233 +- mtest4.c | 294 +- mtest5.c | 237 +- mtest6.c | 194 +- sample-mdb.txt | 28 +- wbench.c | 339 +- yota_test1.c | 367 +- yota_test2.c | 445 +- 37 files changed, 16897 insertions(+), 19073 deletions(-) delete mode 100644 CHANGES delete mode 100644 Doxyfile delete mode 100644 lmdb.h delete mode 100644 mdb.c delete mode 100644 mdb_chk.c delete mode 100644 mdb_copy.c delete mode 100644 mdb_dump.c delete mode 100644 mdb_load.c delete mode 100644 mdb_stat.c create mode 100644 mdbx_chk.c rename mdb_copy.1 => mdbx_copy.1 (94%) create mode 100644 mdbx_copy.c rename mdb_dump.1 => mdbx_dump.1 (94%) create mode 100644 mdbx_dump.c rename mdb_load.1 => mdbx_load.1 (94%) create mode 100644 mdbx_load.c rename mdb_stat.1 => mdbx_stat.1 (95%) create mode 100644 mdbx_stat.c delete mode 100644 midl.c diff --git a/.travis.yml b/.travis.yml index 27287a04..5b6d5ee5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,4 +12,4 @@ compiler: os: - linux -script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all lmdb check; fi +script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi diff --git a/CHANGES b/CHANGES 
deleted file mode 100644 index 93486855..00000000 --- a/CHANGES +++ /dev/null @@ -1,223 +0,0 @@ -MDBX - Add MDB_PREV_MULTIPLE - Add error MDB_PROBLEM, replace some MDB_CORRUPTED - Workarounds for glibc bugs: #21031 and 21032. - -LMDB 0.9.20 Release Engineering - Fix mdb_load with escaped plaintext (ITS#8558) - Fix mdb_cursor_last / mdb_put interaction (ITS#8557) - -LMDB 0.9.19 Release (2016/12/28) - Fix mdb_env_cwalk cursor init (ITS#8424) - Fix robust mutexes on Solaris 10/11 (ITS#8339) - Fix MDB_GET_BOTH on non-dup record (ITS#8393) - Optimize mdb_drop - Fix xcursors after mdb_cursor_del (ITS#8406) - Fix MDB_NEXT_DUP after mdb_cursor_del (ITS#8412) - Fix mdb_cursor_put resetting C_EOF (ITS#8489) - Fix mdb_env_copyfd2 to return EPIPE on SIGPIPE (ITS#8504) - Fix mdb_env_copy with empty DB (ITS#8209) - Fix behaviors with fork (ITS#8505) - Fix mdb_dbi_open with mainDB cursors (ITS#8542) - Fix F_NOCACHE on MacOS, error is non-fatal (ITS#7682) - Documentation - Cleanup doxygen nits - Note reserved vs actual mem/disk usage - - -LMDB 0.9.18 Release (2016/02/05) - already done for mdbx - Fix robust mutex detection on glibc 2.10-11 (ITS#8330) - Fix page_search_root assert on FreeDB (ITS#8336) - Fix MDB_APPENDDUP vs. 
rewrite(single item) (ITS#8334) - n/a for mdbx - Fix mdb_copy of large files on Windows - Fix subcursor move after delete (ITS#8355) - Fix mdb_midl_shrink off-by-one (ITS#8363) - n/a for mdbx - Check for utf8_to_utf16 failures (ITS#7992) - Catch strdup failure in mdb_dbi_open - Build - already done for mdbx - Additional makefile var tweaks (ITS#8169) - Documentation - Add Getting Started page - Update WRITEMAP description - -LMDB 0.9.17 Release (2015/11/30) - Fix ITS#7377 catch calloc failure - Fix ITS#8237 regression from ITS#7589 - Fix ITS#8238 page_split for DUPFIXED pages - Fix ITS#8221 MDB_PAGE_FULL on delete/rebalance - Fix ITS#8258 rebalance/split assert - Fix ITS#8263 cursor_put cursor tracking - Fix ITS#8264 cursor_del cursor tracking - Fix ITS#8310 cursor_del cursor tracking - Fix ITS#8299 mdb_del cursor tracking - Fix ITS#8300 mdb_del cursor tracking - Fix ITS#8304 mdb_del cursor tracking - Fix ITS#7771 fakepage cursor tracking - Fix ITS#7789 ensure mapsize >= pages in use - Fix ITS#7971 mdb_txn_renew0() new reader slots - already done for mdbx - Fix ITS#7969 use __sync_synchronize on non-x86 - Fix ITS#8311 page_split from update_key - Fix ITS#8312 loose pages in nested txn - Fix ITS#8313 mdb_rebalance dummy cursor - Fix ITS#8315 dirty_room in nested txn - Fix ITS#8323 dirty_list in nested txn - Fix ITS#8316 page_merge cursor tracking - Fix ITS#8319 mdb_load error messages - Fix ITS#8320 mdb_load plaintext input - Fix ITS#8321 cursor tracking - Added mdb_txn_id() (ITS#7994) - Added robust mutex support - Miscellaneous cleanup/simplification - Build - Create install dirs if needed (ITS#8256) - not affected mdbx - Fix ThreadProc decl on Win32/MSVC (ITS#8270) - not affected mdbx - Added ssize_t typedef for MSVC (ITS#8067) - not affected mdbx - Use ANSI apis on Windows (ITS#8069) - already done for mdbx - Use O_SYNC if O_DSYNC,MDB_DSYNC are not defined (ITS#7209) - already done for mdbx - Allow passing AR to make (ITS#8168) - Allow passing mandir to make 
install (ITS#8169) - -LMDB 0.9.16 Release (2015/08/14) - Fix cursor EOF bug (ITS#8190) - Fix handling of subDB records (ITS#8181) - Fix mdb_midl_shrink() usage (ITS#8200) - not affected mdbx - fix reference to EINTR on WIN32 from ITS#8106 (ITS#8192) - -LMDB 0.9.15 Release (2015/06/19) - Fix txn init (ITS#7961,#7987) - Fix MDB_PREV_DUP (ITS#7955,#7671) - Fix compact of empty env (ITS#7956) - Fix mdb_copy file mode - Fix mdb_env_close() after failed mdb_env_open() - Fix mdb_rebalance collapsing root (ITS#8062) - Fix mdb_load with large values (ITS#8066) - Fix to retry writes on EINTR (ITS#8106) - Fix mdb_cursor_del on empty DB (ITS#8109) - Fix and Rework comparison for MDB_INTEGERKEY/MDB_INTEGERDUP (ITS#8117) - Fix error handling (ITS#7959,#8157,etc.) - Fix race conditions (ITS#7969,7970) - Added workaround for fdatasync bug in ext3fs - Build - Don't use -fPIC for static lib - Update .gitignore (ITS#7952,#7953) - Cleanup for "make test" (ITS#7841), "make clean", mtest*.c - Misc. Android/Windows cleanup - Documentation - Fix MDB_APPEND doc - Fix MDB_MAXKEYSIZE doc (ITS#8156) - Fix mdb_cursor_put,mdb_cursor_del EACCES description - Fix mdb_env_sync(MDB_RDONLY env) doc (ITS#8021) - Clarify MDB_WRITEMAP doc (ITS#8021) - Clarify mdb_env_open doc - Clarify mdb_dbi_open doc - -LMDB 0.9.14 Release (2014/09/20) - Fix to support 64K page size (ITS#7713) - Fix to persist decreased as well as increased mapsizes (ITS#7789) - Fix cursor bug when deleting last node of a DUPSORT key - Fix mdb_env_info to return FIXEDMAP address - Fix ambiguous error code from writing to closed DBI (ITS#7825) - Fix mdb_copy copying past end of file (ITS#7886) - Fix cursor bugs from page_merge/rebalance - Fix to dirty fewer pages in deletes (mdb_page_loose()) - Fix mdb_dbi_open creating subDBs (ITS#7917) - Fix mdb_cursor_get(_DUP) with single value (ITS#7913) - Fix Windows compat issues in mtests (ITS#7879) - Add compacting variant of mdb_copy - Add BigEndian integer key compare code - Add 
mdb_dump/mdb_load utilities - -LMDB 0.9.13 Release (2014/06/18) - Fix mdb_page_alloc unlimited overflow page search - Documentation - Re-fix MDB_CURRENT doc (ITS#7793) - Fix MDB_GET_MULTIPLE/MDB_NEXT_MULTIPLE doc - -LMDB 0.9.12 Release (2014/06/13) - Fix MDB_GET_BOTH regression (ITS#7875,#7681) - Fix MDB_MULTIPLE writing multiple keys (ITS#7834) - Fix mdb_rebalance (ITS#7829) - Fix mdb_page_split (ITS#7815) - Fix md_entries count (ITS#7861,#7828,#7793) - Fix MDB_CURRENT (ITS#7793) - Fix possible crash on Windows DLL detach - Misc code cleanup - Documentation - mdb_cursor_put: cursor moves on error (ITS#7771) - - -LMDB 0.9.11 Release (2014/01/15) - Add mdb_env_set_assert() (ITS#7775) - Fix: invalidate txn on page allocation errors (ITS#7377) - Fix xcursor tracking in mdb_cursor_del0() (ITS#7771) - Fix corruption from deletes (ITS#7756) - Fix Windows/MSVC build issues - Raise safe limit of max MDB_MAXKEYSIZE - Misc code cleanup - Documentation - Remove spurious note about non-overlapping flags (ITS#7665) - -LMDB 0.9.10 Release (2013/11/12) - Add MDB_NOMEMINIT option - Fix mdb_page_split() again (ITS#7589) - Fix MDB_NORDAHEAD definition (ITS#7734) - Fix mdb_cursor_del() positioning (ITS#7733) - Partial fix for larger page sizes (ITS#7713) - Fix Windows64/MSVC build issues - -LMDB 0.9.9 Release (2013/10/24) - Add mdb_env_get_fd() - Add MDB_NORDAHEAD option - Add MDB_NOLOCK option - Avoid wasting space in mdb_page_split() (ITS#7589) - Fix mdb_page_merge() cursor fixup (ITS#7722) - Fix mdb_cursor_del() on last delete (ITS#7718) - Fix adding WRITEMAP on existing env (ITS#7715) - Fix nested txns (ITS#7515) - Fix mdb_env_copy() O_DIRECT bug (ITS#7682) - Fix mdb_cursor_set(SET_RANGE) return code (ITS#7681) - Fix mdb_rebalance() cursor fixup (ITS#7701) - Misc code cleanup - Documentation - Note that by default, readers need write access - - -LMDB 0.9.8 Release (2013/09/09) - Allow mdb_env_set_mapsize() on an open environment - Fix mdb_dbi_flags() (ITS#7672) - Fix 
mdb_page_unspill() in nested txns - Fix mdb_cursor_get(CURRENT|NEXT) after a delete - Fix mdb_cursor_get(DUP) to always return key (ITS#7671) - Fix mdb_cursor_del() to always advance to next item (ITS#7670) - Fix mdb_cursor_set(SET_RANGE) for tree with single page (ITS#7681) - Fix mdb_env_copy() retry open if O_DIRECT fails (ITS#7682) - Tweak mdb_page_spill() to be less aggressive - Documentation - Update caveats since mdb_reader_check() added in 0.9.7 - -LMDB 0.9.7 Release (2013/08/17) - Don't leave stale lockfile on failed RDONLY open (ITS#7664) - Fix mdb_page_split() ref beyond cursor depth - Fix read txn data race (ITS#7635) - Fix mdb_rebalance (ITS#7536, #7538) - Fix mdb_drop() (ITS#7561) - Misc DEBUG macro fixes - Add MDB_NOTLS envflag - Add mdb_env_copyfd() - Add mdb_txn_env() (ITS#7660) - Add mdb_dbi_flags() (ITS#7661) - Add mdb_env_get_maxkeysize() - Add mdb_env_reader_list()/mdb_env_reader_check() - Add mdb_page_spill/unspill, remove hard txn size limit - Use shorter names for semaphores (ITS#7615) - Build - Fix install target (ITS#7656) - Documentation - Misc updates for cursors, DB handles, data lifetime - -LMDB 0.9.6 Release (2013/02/25) - Many fixes/enhancements - -LMDB 0.9.5 Release (2012/11/30) - Renamed from libmdb to liblmdb - Many fixes/enhancements diff --git a/Doxyfile b/Doxyfile deleted file mode 100644 index 5ca2cfe8..00000000 --- a/Doxyfile +++ /dev/null @@ -1,1631 +0,0 @@ -# Doxyfile 1.7.1 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project -# -# All text after a hash (#) is considered a comment and will be ignored -# The format is: -# TAG = value [value, ...] -# For lists items can also be appended using: -# TAG += value [value, ...] 
-# Values that contain spaces should be placed between quotes (" ") - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all -# text before the first occurrence of this tag. Doxygen uses libiconv (or the -# iconv built into libc) for the transcoding. See -# http://www.gnu.org/software/libiconv for the list of possible encodings. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded -# by quotes) that should identify the project. - -PROJECT_NAME = LMDB - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. -# This could be handy for archiving the generated documentation or -# if some version control system is used. - -PROJECT_NUMBER = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) -# base path where the generated documentation will be put. -# If a relative path is entered, it will be relative to the location -# where doxygen was started. If left blank the current directory will be used. - -OUTPUT_DIRECTORY = - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create -# 4096 sub-directories (in 2 levels) under the output directory of each output -# format and will distribute the generated files over these directories. -# Enabling this option can be useful when feeding doxygen a huge amount of -# source files, where putting all generated files in the same directory would -# otherwise cause performance problems for the file system. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. 
Doxygen will use this -# information to generate all constant output in the proper language. -# The default language is English, other supported languages are: -# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, -# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, -# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English -# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, -# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, -# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will -# include brief member descriptions after the members that are listed in -# the file and class documentation (similar to JavaDoc). -# Set to NO to disable this. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend -# the brief description of a member or function before the detailed description. -# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator -# that is used to form the text in various listings. Each string -# in this list, if found as the leading text of the brief description, will be -# stripped from the text and the result after processing the whole list, is -# used as the annotated text. Otherwise, the brief description is used as-is. 
-# If left blank, the following values are used ("$name" is automatically -# replaced with the name of the entity): "The $name class" "The $name widget" -# "The $name file" "is" "provides" "specifies" "contains" -# "represents" "a" "an" "the" - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# Doxygen will generate a detailed section even if there is only a brief -# description. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full -# path before files name in the file list and in the header files. If set -# to NO the shortest path that makes the file name unique will be used. - -FULL_PATH_NAMES = YES - -# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag -# can be used to strip a user-defined part of the path. Stripping is -# only done if one of the specified strings matches the left-hand part of -# the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the -# path to strip. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of -# the path mentioned in the documentation of a class, which tells -# the reader which header file to include in order to use a class. -# If left blank only the name of the header file containing the class -# definition is used. Otherwise one should specify the include paths that -# are normally passed to the compiler using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter -# (but less readable) file names. 
This can be useful is your file systems -# doesn't support long names like on DOS, Mac, or CD-ROM. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen -# will interpret the first line (until the first dot) of a JavaDoc-style -# comment as the brief description. If set to NO, the JavaDoc -# comments will behave just like regular Qt-style comments -# (thus requiring an explicit @brief command for a brief description.) - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then Doxygen will -# interpret the first line (until the first dot) of a Qt-style -# comment as the brief description. If set to NO, the comments -# will behave just like regular Qt-style comments (thus requiring -# an explicit \brief command for a brief description.) - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen -# treat a multi-line C++ special comment block (i.e. a block of //! or /// -# comments) as a brief description. This used to be the default behaviour. -# The new default is to treat a multi-line C++ comment block as a detailed -# description. Set this tag to YES if you prefer the old behaviour instead. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented -# member inherits the documentation from any documented member that it -# re-implements. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce -# a new page for each member. If set to NO, the documentation of a member will -# be part of the file/class/namespace that contains it. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. -# Doxygen uses this value to replace tabs by spaces in code fragments. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that acts -# as commands in the documentation. An alias has the form "name=value". 
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to -# put the command \sideeffect (or @sideeffect) in the documentation, which -# will result in a user-defined paragraph with heading "Side Effects:". -# You can put \n's in the value part of an alias to insert newlines. - -ALIASES = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C -# sources only. Doxygen will then generate output that is more tailored for C. -# For instance, some of the names that are used will be different. The list -# of all members will be omitted, etc. - -OPTIMIZE_OUTPUT_FOR_C = YES - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java -# sources only. Doxygen will then generate output that is more tailored for -# Java. For instance, namespaces will be presented as packages, qualified -# scopes will look different, etc. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources only. Doxygen will then generate output that is more tailored for -# Fortran. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for -# VHDL. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given extension. -# Doxygen has a built-in mapping, but you can override or extend it using this -# tag. The format is ext=language, where ext is a file extension, and language -# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, -# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make -# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C -# (default is Fortran), use: inc=Fortran f=C. 
Note that for custom extensions -# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should -# set this tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. -# func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. -# Doxygen will parse them like normal C++ but will assume all classes use public -# instead of private inheritance when no explicit protection keyword is present. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate getter -# and setter methods for a property. Setting this option to YES (the default) -# will make doxygen to replace the get and set methods by a property in the -# documentation. This will only work if the methods are indeed getting or -# setting a simple type. If this is not the case, or you want to show the -# methods anyway, you should set this option to NO. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. 
- -DISTRIBUTE_GROUP_DOC = YES - -# Set the SUBGROUPING tag to YES (the default) to allow class member groups of -# the same type (for instance a group of public functions) to be put as a -# subgroup of that type (e.g. under the Public Functions section). Set it to -# NO to prevent subgrouping. Alternatively, this can be done per class using -# the \nosubgrouping command. - -SUBGROUPING = YES - -INLINE_GROUPED_CLASSES = YES -# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum -# is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically -# be useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. - -TYPEDEF_HIDES_STRUCT = YES - -# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to -# determine which symbols to keep in memory and which to flush to disk. -# When the cache is full, less often used symbols will be written to disk. -# For small to medium size projects (<1000 input files) the default value is -# probably good enough. For larger projects a too small cache size can cause -# doxygen to be busy swapping symbols to and from disk most of the time -# causing a significant performance penality. -# If the system has enough physical memory increasing the cache will improve the -# performance by keeping more symbols in memory. Note that the value works on -# a logarithmic scale so increasing the size by one will rougly double the -# memory usage. The cache size is given by this formula: -# 2^(16+SYMBOL_CACHE_SIZE). 
The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols - -SYMBOL_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. -# Private class members and static file members will be hidden unless -# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class -# will be included in the documentation. - -EXTRACT_PRIVATE = NO - -# If the EXTRACT_STATIC tag is set to YES all static members of a file -# will be included in the documentation. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) -# defined locally in source files will be included in the documentation. -# If set to NO only classes defined in header files are included. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local -# methods, which are defined in the implementation section but not in -# the interface are included in the documentation. -# If set to NO (the default) only methods in the interface are included. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base -# name of the file that contains the anonymous namespace. By default -# anonymous namespace are hidden. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all -# undocumented members of documented classes, files or namespaces. 
-# If set to NO (the default) these members will be included in the -# various overviews, but no documentation section is generated. -# This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. -# If set to NO (the default) these classes will be included in the various -# overviews. This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all -# friend (class|struct|union) declarations. -# If set to NO (the default) these declarations will be included in the -# documentation. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any -# documentation blocks found inside the body of a function. -# If set to NO (the default) these blocks will be appended to the -# function's detailed documentation block. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation -# that is typed after a \internal command is included. If the tag is set -# to NO (the default) then the documentation will be excluded. -# Set it to YES to include the internal documentation. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate -# file names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen -# will show members with their full class and namespace scopes in the -# documentation. If set to YES the scope will be hidden. 
- -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen -# will put a list of the files that are included by a file in the documentation -# of that file. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen -# will list include files with double quotes in the documentation -# rather than with sharp brackets. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] -# is inserted in the documentation for inline members. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen -# will sort the (detailed) documentation of file and class members -# alphabetically by member name. If set to NO the members will appear in -# declaration order. - -SORT_MEMBER_DOCS = NO - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the -# brief documentation of file, namespace and class members alphabetically -# by member name. If set to NO (the default) the members will appear in -# declaration order. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen -# will sort the (brief and detailed) documentation of class members so that -# constructors and destructors are listed first. If set to NO (the default) -# the constructors will appear in the respective orders defined by -# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. -# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO -# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the -# hierarchy of group names into alphabetical order. If set to NO (the default) -# the group names will appear in their defined order. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be -# sorted by fully-qualified names, including namespaces. 
If set to -# NO (the default), the class list will be sorted only by class name, -# not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the -# alphabetical list. - -SORT_BY_SCOPE_NAME = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or -# disable (NO) the todo list. This list is created by putting \todo -# commands in the documentation. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or -# disable (NO) the test list. This list is created by putting \test -# commands in the documentation. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or -# disable (NO) the bug list. This list is created by putting \bug -# commands in the documentation. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or -# disable (NO) the deprecated list. This list is created by putting -# \deprecated commands in the documentation. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional -# documentation sections, marked by \if sectionname ... \endif. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines -# the initial value of a variable or define consists of for it to appear in -# the documentation. If the initializer consists of more lines than specified -# here it will be hidden. Use a value of 0 to hide initializers completely. -# The appearance of the initializer of individual variables and defines in the -# documentation can be controlled using \showinitializer or \hideinitializer -# command in the documentation regardless of this setting. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated -# at the bottom of the documentation of classes and structs. 
If set to YES the -# list will mention the files that were used to generate the documentation. - -SHOW_USED_FILES = YES - -# If the sources in your project are distributed over multiple directories -# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy -# in the documentation. The default is NO. - -SHOW_DIRECTORIES = NO - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. -# This will remove the Files entry from the Quick Index and from the -# Folder Tree View (if specified). The default is YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the -# Namespaces page. -# This will remove the Namespaces entry from the Quick Index -# and from the Folder Tree View (if specified). The default is YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command <command> <input-file>, where <command> is the value of -# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file -# provided by doxygen. Whatever the program writes to standard output -# is used as the file version. See the manual for examples. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. The create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. -# You can optionally specify a file name after the option, if omitted -# DoxygenLayout.xml will be used as the name of the layout file. 
- -LAYOUT_FILE = - -#--------------------------------------------------------------------------- -# configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated -# by doxygen. Possible values are YES and NO. If left blank NO is used. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated by doxygen. Possible values are YES and NO. If left blank -# NO is used. - -WARNINGS = YES - -# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings -# for undocumented members. If EXTRACT_ALL is set to YES then this flag will -# automatically be disabled. - -WARN_IF_UNDOCUMENTED = YES - -# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some -# parameters in a documented function, or documenting parameters that -# don't exist or using markup commands wrongly. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be abled to get warnings for -# functions that are documented, but have no documentation for their parameters -# or return value. If set to NO (the default) doxygen will only warn about -# wrong or incomplete parameter documentation, but not about the absence of -# documentation. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that -# doxygen can produce. The string should contain the $file, $line, and $text -# tags, which will be replaced by the file and line number from which the -# warning originated and the warning text. 
Optionally the format may contain -# $version, which will be replaced by the version of the file (if it could -# be obtained via FILE_VERSION_FILTER) - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning -# and error messages should be written. If left blank the output is written -# to stderr. - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag can be used to specify the files and/or directories that contain -# documented source files. You may enter file names like "myfile.cpp" or -# directories like "/usr/src/myproject". Separate the files or directories -# with spaces. - -INPUT = lmdb.h midl.h mdb.c midl.c intro.doc - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is -# also the default input encoding. Doxygen uses libiconv (or the iconv built -# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for -# the list of possible encodings. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank the following patterns are tested: -# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx -# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 - -FILE_PATTERNS = - -# The RECURSIVE tag can be used to turn specify whether or not subdirectories -# should be searched for input files as well. Possible values are YES and NO. -# If left blank NO is used. 
- -RECURSIVE = NO - -# The EXCLUDE tag can be used to specify files and/or directories that should -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used select whether or not files or -# directories that are symbolic links (a Unix filesystem feature) are excluded -# from the input. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. Note that the wildcards are matched -# against the file with absolute path, so to exclude all test directories -# for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or -# directories that contain example code fragments that are included (see -# the \include command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank all files are included. - -EXAMPLE_PATTERNS = - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude -# commands irrespective of the value of the RECURSIVE tag. -# Possible values are YES and NO. If left blank NO is used. 
- -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or -# directories that contain image that are included in the documentation (see -# the \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command , where -# is the value of the INPUT_FILTER tag, and is the name of an -# input file. Doxygen will then use the output that the filter program writes -# to standard output. -# If FILTER_PATTERNS is specified, this tag will be -# ignored. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. -# Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. -# The filters are a list of the form: -# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further -# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER -# is applied to all files. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will be used to filter the input files when producing source -# files to browse (i.e. when SOURCE_BROWSER is set to YES). - -FILTER_SOURCE_FILES = NO - -#--------------------------------------------------------------------------- -# configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will -# be generated. Documented entities will be cross-referenced with these sources. -# Note: To get rid of all source code in the generated output, make sure also -# VERBATIM_HEADERS is set to NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body -# of functions and classes directly in the documentation. 
- -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct -# doxygen to hide any special comment blocks from generated source code -# fragments. Normal C and C++ comments will always remain visible. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES -# then for each documented function all documented -# functions referencing it will be listed. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES -# then for each documented function all documented entities -# called/used by that function will be listed. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) -# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from -# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will -# link to the source code. -# Otherwise they will link to the documentation. - -REFERENCES_LINK_SOURCE = YES - -# If the USE_HTAGS tag is set to YES then the references to source code -# will point to the HTML generated by the htags(1) tool instead of doxygen -# built-in source browser. The htags tool is part of GNU's global source -# tagging system (see http://www.gnu.org/software/global/global.html). You -# will need version 4.8.6 or higher. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen -# will generate a verbatim copy of the header file for each class for -# which an include is specified. Set to NO to disable this. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index -# of all compounds will be generated. Enable this if the project -# contains a lot of classes, structs, unions or interfaces. 
- -ALPHABETICAL_INDEX = YES - -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all -# classes will be put under the same header in the alphabetical index. -# The IGNORE_PREFIX tag can be used to specify one or more prefixes that -# should be ignored while generating the index headers. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES (the default) Doxygen will -# generate HTML output. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `html' will be used as the default path. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for -# each generated HTML page (for example: .htm,.php,.asp). If it is left blank -# doxygen will generate files with .html extension. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a personal HTML header for -# each generated HTML page. If it is left blank doxygen will generate a -# standard header. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a personal HTML footer for -# each generated HTML page. If it is left blank doxygen will generate a -# standard footer. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading -# style sheet that is used by each HTML page. It can be used to -# fine-tune the look of the HTML output. 
If the tag is left blank doxygen -# will generate a default style sheet. Note that doxygen will try to copy -# the style sheet file to the HTML output directory, so don't put your own -# stylesheet in the HTML output directory as well, or it will be erased! - -HTML_STYLESHEET = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. -# Doxygen will adjust the colors in the stylesheet and background images -# according to this color. Hue is specified as an angle on a colorwheel, -# see http://en.wikipedia.org/wiki/Hue for more information. -# For instance the value 0 represents red, 60 is yellow, 120 is green, -# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. -# The allowed range is 0 to 359. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of -# the colors in the HTML output. For a value of 0 the output will use -# grayscales only. A value of 255 will produce the most vivid colors. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to -# the luminance component of the colors in the HTML output. Values below -# 100 gradually make the output lighter, whereas values above 100 make -# the output darker. The value divided by 100 is the actual gamma applied, -# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, -# and 100 does not change the gamma. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting -# this to NO can help when comparing the output of multiple runs. - -HTML_TIMESTAMP = YES - -# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, -# files or namespaces will be aligned in HTML using tables. If set to -# NO a bullet list will be used. 
- -HTML_ALIGN_MEMBERS = YES - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. For this to work a browser that supports -# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox -# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). - -HTML_DYNAMIC_SECTIONS = NO - -# If the GENERATE_DOCSET tag is set to YES, additional index files -# will be generated that can be used as input for Apple's Xcode 3 -# integrated development environment, introduced with OSX 10.5 (Leopard). -# To create a documentation set, doxygen will generate a Makefile in the -# HTML output directory. Running make will produce the docset in that -# directory and running "make install" will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find -# it at startup. -# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. - -GENERATE_DOCSET = NO - -# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the -# feed. A documentation feed provides an umbrella under which multiple -# documentation sets from a single provider (such as a company or product suite) -# can be grouped. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that -# should uniquely identify the documentation set bundle. This should be a -# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen -# will append .docset to the name. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. 
- -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES, additional index files -# will be generated that can be used as input for tools like the -# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) -# of the generated HTML documentation. - -GENERATE_HTMLHELP = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can -# be used to specify the file name of the resulting .chm file. You -# can add a path in front of the file if the result should not be -# written to the html output directory. - -CHM_FILE = - -# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can -# be used to specify the location (absolute path including file name) of -# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run -# the HTML help compiler on the generated index.hhp. - -HHC_LOCATION = - -# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag -# controls if a separate .chi index file is generated (YES) or that -# it should be included in the master .chm file (NO). - -GENERATE_CHI = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING -# is used to encode HtmlHelp index (hhk), content (hhc) and project file -# content. - -CHM_INDEX_ENCODING = - -# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag -# controls whether a binary table of contents is generated (YES) or a -# normal table of contents (NO) in the .chm file. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members -# to the contents of the HTML help documentation and to the tree view. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated -# that can be used as input for Qt's qhelpgenerator to generate a -# Qt Compressed Help (.qch) of the generated HTML documentation. 
- -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can -# be used to specify the file name of the resulting .qch file. -# The path specified is relative to the HTML output folder. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#namespace - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#virtual-folders - -QHP_VIRTUAL_FOLDER = doc - -# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to -# add. For more information please see -# http://doc.trolltech.com/qthelpproject.html#custom-filters - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see -# -# Qt Help Project / Custom Filters. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's -# filter section matches. -# -# Qt Help Project / Filter Attributes. - -QHP_SECT_FILTER_ATTRS = - -# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can -# be used to specify the location of Qt's qhelpgenerator. -# If non-empty doxygen will try to run qhelpgenerator on the generated -# .qhp file. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files -# will be generated, which together with the HTML files, form an Eclipse help -# plugin. To install this plugin and make it available under the help contents -# menu in Eclipse, the contents of the directory containing the HTML and XML -# files needs to be copied into the plugins directory of eclipse. The name of -# the directory within the plugins directory should be the same as -# the ECLIPSE_DOC_ID value. 
After copying Eclipse needs to be restarted before -# the help appears. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have -# this name. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# The DISABLE_INDEX tag can be used to turn on/off the condensed index at -# top of each HTML page. The value NO (the default) enables the index and -# the value YES disables it. - -DISABLE_INDEX = NO - -# This tag can be used to set the number of enum values (range [1..20]) -# that doxygen will group on one line in the generated HTML documentation. - -ENUM_VALUES_PER_LINE = 4 - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. -# If the tag value is set to YES, a side panel will be generated -# containing a tree-like index structure (just like the one that -# is generated for HTML Help). For this to work a browser that supports -# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). -# Windows users are probably better off using the HTML help feature. - -GENERATE_TREEVIEW = NO - -# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, -# and Class Hierarchy pages using a tree view instead of an ordered list. - -USE_INLINE_TREES = NO - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be -# used to set the initial width (in pixels) of the frame in which the tree -# is shown. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open -# links to external symbols imported via tag files in a separate window. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of Latex formulas included -# as images in the HTML documentation. The default is 10. 
Note that -# when you change the font size after a successful doxygen run you need -# to manually remove any form_*.png images from the HTML output directory -# to force them to be regenerated. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are -# not supported properly for IE 6.0, but are supported on all modern browsers. -# Note that when changing this option you need to delete any form_*.png files -# in the HTML output before the changes have effect. - -FORMULA_TRANSPARENT = YES - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box -# for the HTML output. The underlying search engine uses javascript -# and DHTML and should work on any modern browser. Note that when using -# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets -# (GENERATE_DOCSET) there is already a search function so this one should -# typically be disabled. For large projects the javascript based search engine -# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. - -SEARCHENGINE = YES - -# When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a PHP enabled web server instead of at the web client -# using Javascript. Doxygen will generate the search PHP script and index -# file to put on the web server. The advantage of the server -# based approach is that it scales better to large projects and allows -# full text search. The disadvances is that it is more difficult to setup -# and does not have live searching capabilities. - -SERVER_BASED_SEARCH = NO - -#--------------------------------------------------------------------------- -# configuration options related to the LaTeX output -#--------------------------------------------------------------------------- - -# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will -# generate Latex output. 
- -GENERATE_LATEX = NO - -# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `latex' will be used as the default path. - -LATEX_OUTPUT = latex - -# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be -# invoked. If left blank `latex' will be used as the default command name. -# Note that when enabling USE_PDFLATEX this option is only used for -# generating bitmaps for formulas in the HTML output, but not in the -# Makefile that is written to the output directory. - -LATEX_CMD_NAME = latex - -# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to -# generate index for LaTeX. If left blank `makeindex' will be used as the -# default command name. - -MAKEINDEX_CMD_NAME = makeindex - -# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact -# LaTeX documents. This may be useful for small projects and may help to -# save some trees in general. - -COMPACT_LATEX = NO - -# The PAPER_TYPE tag can be used to set the paper type that is used -# by the printer. Possible values are: a4, a4wide, letter, legal and -# executive. If left blank a4wide will be used. - -PAPER_TYPE = a4wide - -# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX -# packages that should be included in the LaTeX output. - -EXTRA_PACKAGES = - -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for -# the generated latex document. The header should contain everything until -# the first chapter. If it is left blank doxygen will generate a -# standard header. Notice: only use this tag if you know what you are doing! - -LATEX_HEADER = - -# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated -# is prepared for conversion to pdf (using ps2pdf). 
The pdf file will -# contain links (just like the HTML output) instead of page references -# This makes the output suitable for online browsing using a pdf viewer. - -PDF_HYPERLINKS = YES - -# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of -# plain latex in the generated Makefile. Set this option to YES to get a -# higher quality PDF documentation. - -USE_PDFLATEX = YES - -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. -# command to the generated LaTeX files. This will instruct LaTeX to keep -# running if errors occur, instead of asking the user for help. -# This option is also used when generating formulas in HTML. - -LATEX_BATCHMODE = NO - -# If LATEX_HIDE_INDICES is set to YES then doxygen will not -# include the index chapters (such as File Index, Compound Index, etc.) -# in the output. - -LATEX_HIDE_INDICES = NO - -# If LATEX_SOURCE_CODE is set to YES then doxygen will include -# source code with syntax highlighting in the LaTeX output. -# Note that which sources are shown also depends on other settings -# such as SOURCE_BROWSER. - -LATEX_SOURCE_CODE = NO - -#--------------------------------------------------------------------------- -# configuration options related to the RTF output -#--------------------------------------------------------------------------- - -# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output -# The RTF output is optimized for Word 97 and may not look very pretty with -# other RTF readers or editors. - -GENERATE_RTF = NO - -# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `rtf' will be used as the default path. - -RTF_OUTPUT = rtf - -# If the COMPACT_RTF tag is set to YES Doxygen generates more compact -# RTF documents. This may be useful for small projects and may help to -# save some trees in general. 
- -COMPACT_RTF = NO - -# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated -# will contain hyperlink fields. The RTF file will -# contain links (just like the HTML output) instead of page references. -# This makes the output suitable for online browsing using WORD or other -# programs which support those fields. -# Note: wordpad (write) and others do not support links. - -RTF_HYPERLINKS = NO - -# Load stylesheet definitions from file. Syntax is similar to doxygen's -# config file, i.e. a series of assignments. You only have to provide -# replacements, missing definitions are set to their default value. - -RTF_STYLESHEET_FILE = - -# Set optional variables used in the generation of an rtf document. -# Syntax is similar to doxygen's config file. - -RTF_EXTENSIONS_FILE = - -#--------------------------------------------------------------------------- -# configuration options related to the man page output -#--------------------------------------------------------------------------- - -# If the GENERATE_MAN tag is set to YES (the default) Doxygen will -# generate man pages - -GENERATE_MAN = YES - -# The MAN_OUTPUT tag is used to specify where the man pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `man' will be used as the default path. - -MAN_OUTPUT = man - -# The MAN_EXTENSION tag determines the extension that is added to -# the generated man pages (default is the subroutine's section .3) - -MAN_EXTENSION = .3 - -# If the MAN_LINKS tag is set to YES and Doxygen generates man output, -# then it will generate one additional man file for each entity -# documented in the real man page(s). These additional files -# only source the real man page, but without them the man command -# would be unable to find the correct page. The default is NO. 
- -MAN_LINKS = NO - -#--------------------------------------------------------------------------- -# configuration options related to the XML output -#--------------------------------------------------------------------------- - -# If the GENERATE_XML tag is set to YES Doxygen will -# generate an XML file that captures the structure of -# the code including all documentation. - -GENERATE_XML = NO - -# The XML_OUTPUT tag is used to specify where the XML pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `xml' will be used as the default path. - -XML_OUTPUT = xml - -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - -# If the XML_PROGRAMLISTING tag is set to YES Doxygen will -# dump the program listings (including syntax highlighting -# and cross-referencing information) to the XML output. Note that -# enabling this will significantly increase the size of the XML output. - -XML_PROGRAMLISTING = YES - -#--------------------------------------------------------------------------- -# configuration options for the AutoGen Definitions output -#--------------------------------------------------------------------------- - -# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will -# generate an AutoGen Definitions (see autogen.sf.net) file -# that captures the structure of the code including all -# documentation. Note that this feature is still experimental -# and incomplete at the moment. 
- -GENERATE_AUTOGEN_DEF = NO - -#--------------------------------------------------------------------------- -# configuration options related to the Perl module output -#--------------------------------------------------------------------------- - -# If the GENERATE_PERLMOD tag is set to YES Doxygen will -# generate a Perl module file that captures the structure of -# the code including all documentation. Note that this -# feature is still experimental and incomplete at the -# moment. - -GENERATE_PERLMOD = NO - -# If the PERLMOD_LATEX tag is set to YES Doxygen will generate -# the necessary Makefile rules, Perl scripts and LaTeX code to be able -# to generate PDF and DVI output from the Perl module output. - -PERLMOD_LATEX = NO - -# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be -# nicely formatted so it can be parsed by a human reader. -# This is useful -# if you want to understand what is going on. -# On the other hand, if this -# tag is set to NO the size of the Perl module output will be much smaller -# and Perl will parse it just the same. - -PERLMOD_PRETTY = YES - -# The names of the make variables in the generated doxyrules.make file -# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. -# This is useful so different doxyrules.make files included by the same -# Makefile don't overwrite each other's variables. - -PERLMOD_MAKEVAR_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the preprocessor -#--------------------------------------------------------------------------- - -# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will -# evaluate all C-preprocessor directives found in the sources and include -# files. - -ENABLE_PREPROCESSING = YES - -# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro -# names in the source code. If set to NO (the default) only conditional -# compilation will be performed. 
Macro expansion can be done in a controlled -# way by setting EXPAND_ONLY_PREDEF to YES. - -MACRO_EXPANSION = NO - -# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES -# then the macro expansion is limited to the macros specified with the -# PREDEFINED and EXPAND_AS_DEFINED tags. - -EXPAND_ONLY_PREDEF = NO - -# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files -# in the INCLUDE_PATH (see below) will be search if a #include is found. - -SEARCH_INCLUDES = YES - -# The INCLUDE_PATH tag can be used to specify one or more directories that -# contain include files that are not input files but should be processed by -# the preprocessor. - -INCLUDE_PATH = - -# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard -# patterns (like *.h and *.hpp) to filter out the header-files in the -# directories. If left blank, the patterns specified with FILE_PATTERNS will -# be used. - -INCLUDE_FILE_PATTERNS = - -# The PREDEFINED tag can be used to specify one or more macro names that -# are defined before the preprocessor is started (similar to the -D option of -# gcc). The argument of the tag is a list of macros of the form: name -# or name=definition (no spaces). If the definition and the = are -# omitted =1 is assumed. To prevent a macro definition from being -# undefined via #undef or recursively expanded use the := operator -# instead of the = operator. - -PREDEFINED = DEBUG=2 __GNUC__=1 - -# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then -# this tag can be used to specify a list of macro names that should be expanded. -# The macro definition that is found in the sources will be used. -# Use the PREDEFINED tag if you want to use a different macro definition. 
- -EXPAND_AS_DEFINED = - -# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then -# doxygen's preprocessor will remove all function-like macros that are alone -# on a line, have an all uppercase name, and do not end with a semicolon. Such -# function macros are typically used for boiler-plate code, and will confuse -# the parser if not removed. - -SKIP_FUNCTION_MACROS = YES - -#--------------------------------------------------------------------------- -# Configuration::additions related to external references -#--------------------------------------------------------------------------- - -# The TAGFILES option can be used to specify one or more tagfiles. -# Optionally an initial location of the external documentation -# can be added for each tagfile. The format of a tag file without -# this location is as follows: -# -# TAGFILES = file1 file2 ... -# Adding location for the tag files is done as follows: -# -# TAGFILES = file1=loc1 "file2 = loc2" ... -# where "loc1" and "loc2" can be relative or absolute paths or -# URLs. If a location is present for each tag, the installdox tool -# does not have to be run to correct the links. -# Note that each tag file must have a unique name -# (where the name does NOT include the path) -# If a tag file is not located in the directory in which doxygen -# is run, you must also specify the path to the tagfile here. - -TAGFILES = tooltag=./man1 - -# When a file name is specified after GENERATE_TAGFILE, doxygen will create -# a tag file that is based on the input files it reads. - -GENERATE_TAGFILE = - -# If the ALLEXTERNALS tag is set to YES all external classes will be listed -# in the class index. If set to NO only the inherited external classes -# will be listed. - -ALLEXTERNALS = NO - -# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will -# be listed. 
- -EXTERNAL_GROUPS = YES - -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = /usr/bin/perl - -#--------------------------------------------------------------------------- -# Configuration options related to the dot tool -#--------------------------------------------------------------------------- - -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option is superseded by the HAVE_DOT option below. This is only a -# fallback. It is recommended to install and use dot, since it yields more -# powerful graphs. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see -# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - -# If set to YES, the inheritance and collaboration graphs will hide -# inheritance and usage relations if the target is undocumented -# or is not a class. - -HIDE_UNDOC_RELATIONS = YES - -# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is -# available from the path. This tool is part of Graphviz, a graph visualization -# toolkit from AT&T and Lucent Bell Labs. The other options in this section -# have no effect if this option is set to NO (the default) - -HAVE_DOT = NO - -# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is -# allowed to run in parallel. When set to 0 (the default) doxygen will -# base this on the number of processors available in the system. 
You can set it -# explicitly to a value larger than 0 to get control over the balance -# between CPU load and processing speed. - -DOT_NUM_THREADS = 0 - -# By default doxygen will write a font called FreeSans.ttf to the output -# directory and reference it in all dot files that doxygen generates. This -# font does not include all possible unicode characters however, so when you need -# these (or just want a differently looking font) you can specify the font name -# using DOT_FONTNAME. You need need to make sure dot is able to find the font, -# which can be done by putting it in a standard location or by setting the -# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory -# containing the font. - -DOT_FONTNAME = FreeSans.ttf - -# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. -# The default size is 10pt. - -DOT_FONTSIZE = 10 - -# By default doxygen will tell dot to use the output directory to look for the -# FreeSans.ttf font (which doxygen will put there itself). If you specify a -# different font using DOT_FONTNAME you can set the path where dot -# can find it using this tag. - -DOT_FONTPATH = - -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# the CLASS_DIAGRAMS tag to NO. - -CLASS_GRAPH = YES - -# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect implementation dependencies (inheritance, containment, and -# class references variables) of the class with other documented classes. 
- -COLLABORATION_GRAPH = YES - -# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for groups, showing the direct groups dependencies - -GROUP_GRAPHS = YES - -# If the UML_LOOK tag is set to YES doxygen will generate inheritance and -# collaboration diagrams in a style similar to the OMG's Unified Modeling -# Language. - -UML_LOOK = NO - -# If set to YES, the inheritance and collaboration graphs will show the -# relations between templates and their instances. - -TEMPLATE_RELATIONS = NO - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT -# tags are set to YES then doxygen will generate a graph for each documented -# file showing the direct and indirect include dependencies of the file with -# other documented files. - -INCLUDE_GRAPH = YES - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and -# HAVE_DOT tags are set to YES then doxygen will generate a graph for each -# documented header file showing the documented files that directly or -# indirectly include this file. - -INCLUDED_BY_GRAPH = YES - -# If the CALL_GRAPH and HAVE_DOT options are set to YES then -# doxygen will generate a call dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable call graphs -# for selected functions only using the \callgraph command. - -CALL_GRAPH = NO - -# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then -# doxygen will generate a caller dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable caller -# graphs for selected functions only using the \callergraph command. 
- -CALLER_GRAPH = NO - -# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen -# will graphical hierarchy of all classes instead of a textual one. - -GRAPHICAL_HIERARCHY = YES - -# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES -# then doxygen will show the dependencies a directory has on other directories -# in a graphical way. The dependency relations are determined by the #include -# relations between the files in the directories. - -DIRECTORY_GRAPH = YES - -# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images -# generated by dot. Possible values are png, jpg, or gif -# If left blank png will be used. - -DOT_IMAGE_FORMAT = png - -# The tag DOT_PATH can be used to specify the path where the dot tool can be -# found. If left blank, it is assumed the dot tool can be found in the path. - -DOT_PATH = - -# The DOTFILE_DIRS tag can be used to specify one or more directories that -# contain dot files that are included in the documentation (see the -# \dotfile command). - -DOTFILE_DIRS = - -# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of -# nodes that will be shown in the graph. If the number of nodes in a graph -# becomes larger than this value, doxygen will truncate the graph, which is -# visualized by representing a node as a red box. Note that doxygen if the -# number of direct children of the root node in a graph is already larger than -# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note -# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. - -DOT_GRAPH_MAX_NODES = 50 - -# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the -# graphs generated by dot. A depth value of 3 means that only nodes reachable -# from the root by following a path via at most 3 edges will be shown. Nodes -# that lay further from the root node will be omitted. 
Note that setting this -# option to 1 or 2 may greatly reduce the computation time needed for large -# code bases. Also note that the size of a graph can be further restricted by -# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. - -MAX_DOT_GRAPH_DEPTH = 0 - -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not -# seem to support this out of the box. Warning: Depending on the platform used, -# enabling this option may lead to badly anti-aliased labels on the edges of -# a graph (i.e. they become hard to read). - -DOT_TRANSPARENT = NO - -# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output -# files in one run (i.e. multiple -o and -T options on the command line). This -# makes dot run faster, but since only newer versions of dot (>1.8.10) -# support this, this feature is disabled by default. - -DOT_MULTI_TARGETS = YES - -# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will -# generate a legend page explaining the meaning of the various boxes and -# arrows in the dot generated graphs. - -GENERATE_LEGEND = YES - -# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will -# remove the intermediate dot files that are used to generate -# the various graphs. 
- -DOT_CLEANUP = YES diff --git a/Makefile b/Makefile index 66d7e278..4b6c3614 100644 --- a/Makefile +++ b/Makefile @@ -36,24 +36,21 @@ IOARENA ?= ../ioarena.git/@BUILD/src/ioarena ######################################################################## -HEADERS := lmdb.h mdbx.h +HEADERS := mdbx.h LIBRARIES := libmdbx.a libmdbx.so TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk -MANPAGES := mdb_stat.1 mdb_copy.1 mdb_dump.1 mdb_load.1 +MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 TESTS := mtest0 mtest1 mtest2 mtest3 mtest4 mtest5 mtest6 wbench \ yota_test1 yota_test2 -SRC_LMDB := mdb.c midl.c lmdb.h midl.h reopen.h barriers.h -SRC_MDBX := $(SRC_LMDB) mdbx.c mdbx.h +SRC_MDBX := mdbx.c mdbx.h reopen.h barriers.h -.PHONY: mdbx lmdb all install clean check tests coverage +.PHONY: mdbx all install clean check tests coverage all: $(LIBRARIES) $(TOOLS) mdbx: libmdbx.a libmdbx.so -lmdb: liblmdb.a liblmdb.so - tools: $(TOOLS) install: $(LIBRARIES) $(TOOLS) $(HEADERS) @@ -88,25 +85,19 @@ libmdbx.a: mdbx.o libmdbx.so: mdbx.lo $(CC) $(CFLAGS) $(LDFLAGS) -save-temps -pthread -shared $(LDOPS) -o $@ $^ -liblmdb.a: lmdb.o - $(AR) rs $@ $^ - -liblmdb.so: lmdb.lo - $(CC) $(CFLAGS) $(LDFLAGS) -pthread -shared $(LDOPS) -o $@ $^ - -mdbx_stat: mdb_stat.o mdbx.o +mdbx_stat: mdbx_stat.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_copy: mdb_copy.o mdbx.o +mdbx_copy: mdbx_copy.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_dump: mdb_dump.o mdbx.o +mdbx_dump: mdbx_dump.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_load: mdb_load.o mdbx.o +mdbx_load: mdbx_load.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_chk: mdb_chk.o mdbx.o +mdbx_chk: mdbx_chk.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ mtest0: mtest0.o mdbx.o @@ -145,16 +136,10 @@ mdbx.o: $(SRC_MDBX) mdbx.lo: $(SRC_MDBX) $(CC) $(CFLAGS) -fPIC -c mdbx.c -o $@ -lmdb.o: $(SRC_LMDB) - $(CC) $(CFLAGS) -c mdb.c -o $@ - -lmdb.lo: $(SRC_LMDB) - $(CC) 
$(CFLAGS) -fPIC -c mdb.c -o $@ - %: %.o $(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ -%.o: %.c lmdb.h mdbx.h +%.o: %.c mdbx.h $(CC) $(CFLAGS) -c $< COFLAGS = -fprofile-arcs -ftest-coverage @@ -188,13 +173,11 @@ endef $(eval $(call bench-rule,mdbx,$(NN),libmdbx.so)) -$(eval $(call bench-rule,lmdb,$(NN))) - $(eval $(call bench-rule,dummy,$(NN))) $(eval $(call bench-rule,debug,10)) -bench: bench-lmdb.txt bench-mdbx.txt +bench: bench-mdbx.txt endif diff --git a/README.md b/README.md index bac63b48..56f5ec31 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ RECLAIM` в _libmdbx_. сохранены мета-страницы со ссылками на страницы с новыми версиями данных, но не сами новые данные. В этом случае БД будет безвозвратна разрушена, даже если до аварии производилась -полная синхронизация данных (посредством `mdb_env_sync()`). +полная синхронизация данных (посредством `mdbx_env_sync()`). В _libmdbx_ эта проблема устранена, подробности ниже. @@ -248,7 +248,7 @@ RECLAIM` в _libmdbx_. сохранены мета-страницы со ссылками на страницы с новыми версиями данных, но не сами новые данные. В этом случае БД будет безвозвратна разрушена, даже если до аварии производилась - полная синхронизация данных (посредством `mdb_env_sync()`). + полная синхронизация данных (посредством `mdbx_env_sync()`). В _libmdbx_ эта проблема устранена путем полной переработки пути записи данных: @@ -371,5 +371,5 @@ RECLAIM` в _libmdbx_. 25. При завершении читающих транзакций, открытые в них DBI-хендлы не закрываются и не теряются при завершении таких транзакций посредством -mdb_txn_abort() или mdb_txn_reset(). Что позволяет избавится от ряда +mdbx_txn_abort() или mdbx_txn_reset(). Что позволяет избавиться от ряда сложно обнаруживаемых ошибок. diff --git a/barriers.h b/barriers.h index ff39cae2..317e60bc 100644 --- a/barriers.h +++ b/barriers.h @@ -17,7 +17,7 @@ * in the most portable way for libmdbx project. * * Feedback and comments are welcome.
- * https://gist.github.com/leo-yuriev/ba186a6bf5cf3a27bae7 */ + * https://gist.github.com/leo-yuriev/ba186a6bf5cf3a27bae7 */ #pragma once /* *INDENT-OFF* */ @@ -140,7 +140,7 @@ static MDBX_INLINE void mdbx_barrier(int type) { #define mdbx_coherent_barrier() \ mdbx_barrier(MDB_CACHE_IS_COHERENT ? MDBX_BARRIER_COMPILER : MDBX_BARRIER_MEMORY) -static MDBX_INLINE void mdb_invalidate_cache(void *addr, int nbytes) { +static MDBX_INLINE void mdbx_invalidate_cache(void *addr, int nbytes) { mdbx_coherent_barrier(); #if defined(__mips) && defined(__linux) /* MIPS has cache coherency issues. diff --git a/lmdb.h b/lmdb.h deleted file mode 100644 index 48ca616f..00000000 --- a/lmdb.h +++ /dev/null @@ -1,1557 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * - * This code is derived from "LMDB engine" written by - * Howard Chu (Symas Corporation), which itself derived from btree.c - * written by Martin Hedenfalk. - * - * --- - * - * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * --- - * - * Portions Copyright (c) 2009, 2010 Martin Hedenfalk - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef _LMDB_H_ -#define _LMDB_H_ - -#ifndef MDBX_MODE_ENABLED -# define MDBX_MODE_ENABLED 0 -#endif /* MDBX_MODE_ENABLED */ - -#include -#include -#include -#if MDBX_MODE_ENABLED -# include -# include -#endif /* MDBX_MODE_ENABLED */ - -#ifdef __cplusplus -extern "C" { -#endif - -/** An abstraction for a file handle. - * On POSIX systems file handles are small integers. - */ -typedef int mdb_filehandle_t; - -/** @defgroup mdb LMDB API - * @{ - * @brief OpenLDAP Lightning Memory-Mapped Database Manager - */ -/** @defgroup Version Version Macros - * @{ - */ -/** Library major version */ -#define MDB_VERSION_MAJOR 0 -/** Library minor version */ -#define MDB_VERSION_MINOR 9 -/** Library patch version */ -#define MDB_VERSION_PATCH 19 - -/** Combine args a,b,c into a single integer for easy version comparisons */ -#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) - -/** The full library version as a single integer */ -#define MDB_VERSION_FULL \ - MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) - -/** The release date of this library version */ -#define MDB_VERSION_DATE "DEVEL" - -/** A stringifier for the version info */ -#define MDB_VERSTR(a,b,c,d) "MDBX " #a "." #b "." #c ": (" d ", https://github.com/ReOpen/libmdbx)" - -/** A helper for the stringifier macro */ -#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) - -/** The full library version as a C string */ -#define MDB_VERSION_STRING \ - MDB_VERFOO(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH,MDB_VERSION_DATE) -/** @} */ - -/** @brief Opaque structure for a database environment. 
- * - * A DB environment supports multiple databases, all residing in the same - * shared-memory map. - */ -typedef struct MDB_env MDB_env; - -/** @brief Opaque structure for a transaction handle. - * - * All database operations require a transaction handle. Transactions may be - * read-only or read-write. - */ -typedef struct MDB_txn MDB_txn; - -/** @brief A handle for an individual database in the DB environment. */ -typedef unsigned MDB_dbi; - -/** @brief Opaque structure for navigating through a database */ -typedef struct MDB_cursor MDB_cursor; - -/** @brief Generic structure used for passing keys and data in and out - * of the database. - * - * Values returned from the database are valid only until a subsequent - * update operation, or the end of the transaction. Do not modify or - * free them, they commonly point into the database itself. - * - * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. - * The same applies to data sizes in databases with the #MDB_DUPSORT flag. - * Other data items can in theory be from 0 to 0xffffffff bytes long. - */ -#if MDBX_MODE_ENABLED -typedef struct iovec MDB_val; -# define mv_size iov_len -# define mv_data iov_base -#else -typedef struct MDB_val { - size_t mv_size; /**< size of the data item */ - void *mv_data; /**< address of the data item */ -} MDB_val; -#endif /* MDBX_MODE_ENABLED */ - -/** @brief A callback function used to compare two keys in a database */ -typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); - -/** @brief A callback function used to relocate a position-dependent data item - * in a fixed-address database. - * - * The \b newptr gives the item's desired address in - * the memory map, and \b oldptr gives its previous address. The item's actual - * data resides at the address in \b item. This callback is expected to walk - * through the fields of the record in \b item and modify any - * values based at the \b oldptr address to be relative to the \b newptr address. 
- * @param[in,out] item The item that is to be relocated. - * @param[in] oldptr The previous address. - * @param[in] newptr The new address to relocate to. - * @param[in] relctx An application-provided context, set by #mdb_set_relctx(). - * @todo This feature is currently unimplemented. - */ -typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); - -/** @defgroup mdb_env Environment Flags - * @{ - */ - /** mmap at a fixed address (experimental) */ -#define MDB_FIXEDMAP 0x01 - /** no environment directory */ -#define MDB_NOSUBDIR 0x4000 - /** don't fsync after commit */ -#define MDB_NOSYNC 0x10000 - /** read only */ -#define MDB_RDONLY 0x20000 - /** don't fsync metapage after commit */ -#define MDB_NOMETASYNC 0x40000 - /** use writable mmap */ -#define MDB_WRITEMAP 0x80000 - /** use asynchronous msync when #MDB_WRITEMAP is used */ -#define MDB_MAPASYNC 0x100000 - /** tie reader locktable slots to #MDB_txn objects instead of to threads */ -#define MDB_NOTLS 0x200000 - /** don't do any locking, caller must manage their own locks - * WARNING: libmdbx don't support this mode. 
*/ -#define MDB_NOLOCK__UNSUPPORTED 0x400000 - /** don't do readahead */ -#define MDB_NORDAHEAD 0x800000 - /** don't initialize malloc'd memory before writing to datafile */ -#define MDB_NOMEMINIT 0x1000000 - -#if MDBX_MODE_ENABLED - /** aim to coalesce FreeDB records */ -#define MDBX_COALESCE 0x2000000 - /** LIFO policy for reclaiming FreeDB records */ -#define MDBX_LIFORECLAIM 0x4000000 -#endif /* MDBX_MODE_ENABLED */ - - /** make a steady-sync only on close and explicit env-sync */ -#define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC|MDB_MAPASYNC) - /** debuging option, fill/perturb released pages */ -#define MDBX_PAGEPERTURB 0x8000000 -/** @} */ - -/** @defgroup mdb_dbi_open Database Flags - * @{ - */ - /** use reverse string keys */ -#define MDB_REVERSEKEY 0x02 - /** use sorted duplicates */ -#define MDB_DUPSORT 0x04 - /** numeric keys in native byte order, either unsigned int or #mdb_size_t. - * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdb_size_t.) - * The keys must all be of the same size. */ -#define MDB_INTEGERKEY 0x08 - /** with #MDB_DUPSORT, sorted dup items have fixed size */ -#define MDB_DUPFIXED 0x10 - /** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ -#define MDB_INTEGERDUP 0x20 - /** with #MDB_DUPSORT, use reverse string dups */ -#define MDB_REVERSEDUP 0x40 - /** create DB if not already existing */ -#define MDB_CREATE 0x40000 -/** @} */ - -/** @defgroup mdb_put Write Flags - * @{ - */ -/** For put: Don't write if the key already exists. */ -#define MDB_NOOVERWRITE 0x10 -/** Only for #MDB_DUPSORT
- * For put: don't write if the key and data pair already exist.
- * For mdb_cursor_del: remove all duplicate data items. - */ -#define MDB_NODUPDATA 0x20 -/** For mdb_cursor_put: overwrite the current key/data pair - * MDBX allows this flag for mdb_put() for explicit overwrite/update without insertion. */ -#define MDB_CURRENT 0x40 -/** For put: Just reserve space for data, don't copy it. Return a - * pointer to the reserved space. - */ -#define MDB_RESERVE 0x10000 -/** Data is being appended, don't split full pages. */ -#define MDB_APPEND 0x20000 -/** Duplicate data is being appended, don't split full pages. */ -#define MDB_APPENDDUP 0x40000 -/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ -#define MDB_MULTIPLE 0x80000 -/* @} */ - -/** @defgroup mdb_copy Copy Flags - * @{ - */ -/** Compacting copy: Omit free space from copy, and renumber all - * pages sequentially. - */ -#define MDB_CP_COMPACT 0x01 -/* @} */ - -/** @brief Cursor Get operations. - * - * This is the set of all operations for retrieving data - * using a cursor. - */ -typedef enum MDB_cursor_op { - MDB_FIRST, /**< Position at first key/data item */ - MDB_FIRST_DUP, /**< Position at first data item of current key. - Only for #MDB_DUPSORT */ - MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ - MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ - MDB_GET_CURRENT, /**< Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items - from current cursor position. Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ - MDB_LAST, /**< Position at last key/data item */ - MDB_LAST_DUP, /**< Position at last data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT, /**< Position at next data item */ - MDB_NEXT_DUP, /**< Position at next data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items - from next cursor position. 
Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ - MDB_NEXT_NODUP, /**< Position at first data item of next key */ - MDB_PREV, /**< Position at previous data item */ - MDB_PREV_DUP, /**< Position at previous data item of current key. - Only for #MDB_DUPSORT */ - MDB_PREV_NODUP, /**< Position at last data item of previous key */ - MDB_SET, /**< Position at specified key */ - MDB_SET_KEY, /**< Position at specified key, return key + data */ - MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ - MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to - a page of duplicate data items. Only for #MDB_DUPFIXED */ -} MDB_cursor_op; - -/** @defgroup errors Return Codes - * - * BerkeleyDB uses -30800 to -30999, we'll go under them - * @{ - */ - /** Successful result */ -#define MDB_SUCCESS 0 - /** key/data pair already exists */ -#define MDB_KEYEXIST (-30799) - /** key/data pair not found (EOF) */ -#define MDB_NOTFOUND (-30798) - /** Requested page not found - this usually indicates corruption */ -#define MDB_PAGE_NOTFOUND (-30797) - /** Located page was wrong type */ -#define MDB_CORRUPTED (-30796) - /** Update of meta page failed or environment had fatal error */ -#define MDB_PANIC (-30795) - /** Environment version mismatch */ -#define MDB_VERSION_MISMATCH (-30794) - /** File is not a valid LMDB file */ -#define MDB_INVALID (-30793) - /** Environment mapsize reached */ -#define MDB_MAP_FULL (-30792) - /** Environment maxdbs reached */ -#define MDB_DBS_FULL (-30791) - /** Environment maxreaders reached */ -#define MDB_READERS_FULL (-30790) - /** Txn has too many dirty pages */ -#define MDB_TXN_FULL (-30788) - /** Cursor stack too deep - internal error */ -#define MDB_CURSOR_FULL (-30787) - /** Page has not enough space - internal error */ -#define MDB_PAGE_FULL (-30786) - /** Database contents grew beyond environment mapsize */ -#define MDB_MAP_RESIZED (-30785) - /** Operation and DB 
incompatible, or DB type changed. This can mean: - *
    - *
  • The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. - *
  • Opening a named DB when the unnamed DB has #MDB_DUPSORT / #MDB_INTEGERKEY. - *
  • Accessing a data record as a database, or vice versa. - *
  • The database was dropped and recreated with different flags. - *
- */ -#define MDB_INCOMPATIBLE (-30784) - /** Invalid reuse of reader locktable slot */ -#define MDB_BAD_RSLOT (-30783) - /** Transaction must abort, has a child, or is invalid */ -#define MDB_BAD_TXN (-30782) - /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ -#define MDB_BAD_VALSIZE (-30781) - /** The specified DBI was changed unexpectedly */ -#define MDB_BAD_DBI (-30780) - /** Unexpected problem - txn should abort */ -#define MDB_PROBLEM (-30779) - /** The last defined error code */ -#define MDB_LAST_ERRCODE MDB_PROBLEM -/** @} */ - -/** @brief Statistics for a database in the environment */ -typedef struct MDB_stat { - unsigned ms_psize; /**< Size of a database page. - This is currently the same for all databases. */ - unsigned ms_depth; /**< Depth (height) of the B-tree */ - size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ - size_t ms_leaf_pages; /**< Number of leaf pages */ - size_t ms_overflow_pages; /**< Number of overflow pages */ - size_t ms_entries; /**< Number of data items */ -} MDB_stat; - -typedef struct MDBX_stat { - MDB_stat base; -#if MDBX_MODE_ENABLED - /* LY: TODO */ -#endif /* MDBX_MODE_ENABLED */ -} MDBX_stat; - -/** @brief Information about the environment */ -typedef struct MDB_envinfo { - void *me_mapaddr; /**< Address of map, if fixed */ - size_t me_mapsize; /**< Size of the data memory map */ - size_t me_last_pgno; /**< ID of the last used page */ - size_t me_last_txnid; /**< ID of the last committed transaction */ - unsigned me_maxreaders; /**< max reader slots in the environment */ - unsigned me_numreaders; /**< max reader slots used in the environment */ -} MDB_envinfo; - -typedef struct MDBX_envinfo { - MDB_envinfo base; -#if MDBX_MODE_ENABLED - size_t me_tail_txnid; /**< ID of the last reader transaction */ - size_t me_meta1_txnid, me_meta1_sign; - size_t me_meta2_txnid, me_meta2_sign; -#endif /* MDBX_MODE_ENABLED */ -} MDBX_envinfo; - - /** @brief Return the LMDB library version information. 
- * - * @param[out] major if non-NULL, the library major version number is copied here - * @param[out] minor if non-NULL, the library minor version number is copied here - * @param[out] patch if non-NULL, the library patch version number is copied here - * @retval "version string" The library version as a string - */ -char *mdb_version(int *major, int *minor, int *patch); - - /** @brief Return a string describing a given error code. - * - * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) - * function. If the error code is greater than or equal to 0, then the string - * returned by the system function strerror(3) is returned. If the error code - * is less than 0, an error string corresponding to the LMDB library error is - * returned. See @ref errors for a list of LMDB-specific error codes. - * @param[in] err The error code - * @retval "error message" The description of the error - */ -char *mdb_strerror(int err); - - /** @brief Create an LMDB environment handle. - * - * This function allocates memory for a #MDB_env structure. To release - * the allocated memory and discard the handle, call #mdb_env_close(). - * Before the handle may be used, it must be opened using #mdb_env_open(). - * Various other options may also need to be set before opening the handle, - * e.g. #mdb_env_set_mapsize(), #mdb_env_set_maxreaders(), #mdb_env_set_maxdbs(), - * depending on usage requirements. - * @param[out] env The address where the new handle will be stored - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_create(MDB_env **env); - - /** @brief Open an environment handle. - * - * If this function fails, #mdb_env_close() must be called to discard the #MDB_env handle. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] path The directory in which the database files reside. This - * directory must already exist and be writable. - * @param[in] flags Special options for this environment. 
This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - * Flags set by mdb_env_set_flags() are also used. - *
    - *
  • #MDB_FIXEDMAP - * use a fixed address for the mmap region. This flag must be specified - * when creating the environment, and is stored persistently in the environment. - * If successful, the memory map will always reside at the same virtual address - * and pointers used to reference data items in the database will be constant - * across multiple invocations. This option may not always work, depending on - * how the operating system has allocated memory to shared libraries and other uses. - * The feature is highly experimental. - *
  • #MDB_NOSUBDIR - * By default, LMDB creates its environment in a directory whose - * pathname is given in \b path, and creates its data and lock files - * under that directory. With this option, \b path is used as-is for - * the database main data file. The database lock file is the \b path - * with "-lock" appended. - *
  • #MDB_RDONLY - * Open the environment in read-only mode. No write operations will be - * allowed. LMDB will still modify the lock file - except on read-only - * filesystems, where LMDB does not use locks. - *
  • #MDB_WRITEMAP - * Use a writeable memory map unless MDB_RDONLY is set. This uses - * fewer mallocs but loses protection from application bugs - * like wild pointer writes and other bad updates into the database. - * This may be slightly faster for DBs that fit entirely in RAM, but - * is slower for DBs larger than RAM. - * Incompatible with nested transactions. - * Do not mix processes with and without MDB_WRITEMAP on the same - * environment. This can defeat durability (#mdb_env_sync etc). - *
  • #MDB_NOMETASYNC - * Flush system buffers to disk only once per transaction, omit the - * metadata flush. Defer that until the system flushes files to disk, - * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization - * maintains database integrity, but a system crash may undo the last - * committed transaction. I.e. it preserves the ACI (atomicity, - * consistency, isolation) but not D (durability) database property. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_NOSYNC - * Don't flush system buffers to disk when committing a transaction. - * This optimization means a system crash can corrupt the database or - * lose the last transactions if buffers are not yet flushed to disk. - * The risk is governed by how often the system flushes dirty buffers - * to disk and how often #mdb_env_sync() is called. However, if the - * filesystem preserves write order and the #MDB_WRITEMAP flag is not - * used, transactions exhibit ACI (atomicity, consistency, isolation) - * properties and only lose D (durability). I.e. database integrity - * is maintained, but a system crash may undo the final transactions. - * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no - * hint for when to write transactions to disk, unless #mdb_env_sync() - * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_MAPASYNC - * When using #MDB_WRITEMAP, use asynchronous flushes to disk. - * As with #MDB_NOSYNC, a system crash can then corrupt the - * database or lose the last transactions. Calling #mdb_env_sync() - * ensures on-disk database integrity until next commit. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_NOTLS - * Don't use Thread-Local Storage. Tie reader locktable slots to - * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps - * the slot reseved for the #MDB_txn object. A thread may use parallel - * read-only transactions. A read-only transaction may span threads if - * the user synchronizes its use. Applications that multiplex many - * user threads over individual OS threads need this option. Such an - * application must also serialize the write transactions in an OS - * thread, since LMDB's write locking is unaware of the user threads. - *
  • #MDB_NOLOCK - * Don't do any locking. If concurrent access is anticipated, the - * caller must manage all concurrency itself. For proper operation - * the caller must enforce single-writer semantics, and must ensure - * that no readers are using old transactions while a writer is - * active. The simplest approach is to use an exclusive lock so that - * no readers may be active at all when a writer begins. - *
  • #MDB_NORDAHEAD - * Turn off readahead. Most operating systems perform readahead on - * read requests by default. This option turns it off if the OS - * supports it. Turning it off may help random read performance - * when the DB is larger than RAM and system RAM is full. - *
  • #MDB_NOMEMINIT - * Don't initialize malloc'd memory before writing to unused spaces - * in the data file. By default, memory for pages written to the data - * file is obtained using malloc. While these pages may be reused in - * subsequent transactions, freshly malloc'd pages will be initialized - * to zeroes before use. This avoids persisting leftover data from other - * code (that used the heap and subsequently freed the memory) into the - * data file. Note that many other system libraries may allocate - * and free memory from the heap for arbitrary uses. E.g., stdio may - * use the heap for file I/O buffers. This initialization step has a - * modest performance cost so some applications may want to disable - * it using this flag. This option can be a problem for applications - * which handle sensitive data like passwords, and it makes memory - * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP, - * which writes directly to the mmap instead of using malloc for pages. The - * initialization is also skipped if #MDB_RESERVE is used; the - * caller is expected to overwrite all of the memory that was - * reserved in that case. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDBX_COALESCE - * Aim to coalesce records while reclaiming FreeDB. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDBX_LIFORECLAIM - * LIFO policy for reclaiming FreeDB records. This significantly reduce - * write IPOS in case MDB_NOSYNC with periodically checkpoints. - *
- * @param[in] mode The UNIX permissions to set on created files and semaphores. - * This parameter is ignored on Windows. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the - * version that created the database environment. - *
  • #MDB_INVALID - the environment file headers are corrupted. - *
  • ENOENT - the directory specified by the path parameter doesn't exist. - *
  • EACCES - the user didn't have permission to access the environment files. - *
  • EAGAIN - the environment was locked by another process. - *
- */ -int mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); - /** @brief Copy an LMDB environment to the specified path. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copy(MDB_env *env, const char *path); - - /** @brief Copy an LMDB environment to the specified file descriptor. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); - - /** @brief Copy an LMDB environment to the specified path, with options. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. 
- * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * @param[in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free - * pages and sequentially renumber all pages in output. This option - * consumes more CPU and runs more slowly than the default. - * Currently it fails if the environment has suffered a page leak. - *
- * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copy2(MDB_env *env, const char *path, unsigned flags); - - /** @brief Copy an LMDB environment to the specified file descriptor, - * with options. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. See - * #mdb_env_copy2() for further details. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * @param[in] flags Special options for this operation. - * See #mdb_env_copy2() for options. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned flags); - - /** @brief Return statistics about the LMDB environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] stat The address of an #MDB_stat structure - * where the statistics will be copied - */ -int mdb_env_stat(MDB_env *env, MDB_stat *stat); - - /** @brief Return information about the LMDB environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] stat The address of an #MDB_envinfo structure - * where the information will be copied - */ -int mdb_env_info(MDB_env *env, MDB_envinfo *info); - - /** @brief Flush the data buffers to disk. - * - * Data is always written to disk when #mdb_txn_commit() is called, - * but the operating system may keep it buffered. LMDB always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. 
This call is - * not valid if the environment was opened with #MDB_RDONLY. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] force If non-zero, force a synchronous flush. Otherwise - * if the environment has the #MDB_NOSYNC flag set the flushes - * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EACCES - the environment is read-only. - *
  • EINVAL - an invalid parameter was specified. - *
  • EIO - an error occurred during synchronization. - *
- */ -int mdb_env_sync(MDB_env *env, int force); - - /** @brief Close the environment and release the memory map. - * - * Only a single thread may call this function. All transactions, databases, - * and cursors must already be closed before calling this function. Attempts to - * use any such handles after calling this function will cause a SIGSEGV. - * The environment handle will be freed and must not be used again after this call. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dont_sync A dont'sync flag, if non-zero the last checkpoint - * (meta-page update) will be kept "as is" and may be still "weak" - * in NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be ignored - * on opening next time, and transactions since the last non-weak - * checkpoint (meta-page update) will rolledback for consistency guarantee. - */ -void mdb_env_close(MDB_env *env); - - /** @brief Set environment flags. - * - * This may be used to set some flags in addition to those from - * #mdb_env_open(), or to unset these flags. If several threads - * change the flags at the same time, the result is undefined. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] flags The flags to change, bitwise OR'ed together - * @param[in] onoff A non-zero value sets the flags, zero clears them. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_set_flags(MDB_env *env, unsigned flags, int onoff); - - /** @brief Get environment flags. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] flags The address of an integer to store the flags - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_flags(MDB_env *env, unsigned *flags); - - /** @brief Return the path that was used in #mdb_env_open(). - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] path Address of a string pointer to contain the path. This - * is the actual string in the environment, not a copy. It should not be - * altered in any way. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_path(MDB_env *env, const char **path); - - /** @brief Return the filedescriptor for the given environment. - * - * This function may be called after fork(), so the descriptor can be - * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. - * (Until LMDB 0.9.18, only the lockfile had that.) - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); - - /** @brief Set the size of the memory map to use for this environment. - * - * The size should be a multiple of the OS page size. The default is - * 10485760 bytes. The size of the memory map is also the maximum size - * of the database. The value should be chosen as large as possible, - * to accommodate future growth of the database. - * This function should be called after #mdb_env_create() and before #mdb_env_open(). - * It may be called at later times if no transactions are active in - * this process. Note that the library does not check for this condition, - * the caller must ensure it explicitly. - * - * The new size takes effect immediately for the current process but - * will not be persisted to any others until a write transaction has been - * committed by the current process. Also, only mapsize increases are - * persisted into the environment. - * - * If the mapsize is increased by another process, and data has grown - * beyond the range of the current mapsize, #mdb_txn_begin() will - * return #MDB_MAP_RESIZED. This function may be called with a size - * of zero to adopt the new size. - * - * Any attempt to set a size smaller than the space already consumed - * by the environment will be silently changed to the current size of the used space. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] size The size in bytes - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment has - * an active write transaction. - *
- */ -int mdb_env_set_mapsize(MDB_env *env, size_t size); - - /** @brief Set the maximum number of threads/reader slots for the environment. - * - * This defines the number of slots in the lock table that is used to track readers in the - * the environment. The default is 126. - * Starting a read-only transaction normally ties a lock table slot to the - * current thread until the environment closes or the thread exits. If - * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the - * MDB_txn object until it or the #MDB_env object is destroyed. - * This function may only be called after #mdb_env_create() and before #mdb_env_open(). - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] readers The maximum number of reader lock table slots - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment is already open. - *
- */ -int mdb_env_set_maxreaders(MDB_env *env, unsigned readers); - - /** @brief Get the maximum number of threads/reader slots for the environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] readers Address of an integer to store the number of readers - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_maxreaders(MDB_env *env, unsigned *readers); - - /** @brief Set the maximum number of named databases for the environment. - * - * This function is only needed if multiple databases will be used in the - * environment. Simpler applications that use the environment as a single - * unnamed database can ignore this option. - * This function may only be called after #mdb_env_create() and before #mdb_env_open(). - * - * Currently a moderate number of slots are cheap but a huge number gets - * expensive: 7-120 words per transaction, and every #mdb_dbi_open() - * does a linear search of the opened slots. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dbs The maximum number of databases - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment is already open. - *
- */ -int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); - - /** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. - * - * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. - * See @ref MDB_val. - * @param[in] env An environment handle returned by #mdb_env_create() - * @return The maximum size of a key we can write - */ -int mdb_env_get_maxkeysize(MDB_env *env); - - /** @brief Set application information associated with the #MDB_env. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_set_userctx(MDB_env *env, void *ctx); - - /** @brief Get the application information associated with the #MDB_env. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @return The pointer set by #mdb_env_set_userctx(). - */ -void *mdb_env_get_userctx(MDB_env *env); - - /** @brief A callback function for most LMDB assert() failures, - * called before printing the message and aborting. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] msg The assertion message, not including newline. - */ -typedef void MDB_assert_func(MDB_env *env, const char *msg, - const char *function, unsigned line); - - /** Set or reset the assert() callback of the environment. - * Disabled if liblmdb is buillt with MDB_DEBUG=0. - * @note This hack should become obsolete as lmdb's error handling matures. - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] func An #MDB_assert_func function, or 0. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); - - /** @brief Create a transaction for use with the environment. - * - * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). 
- * @note A transaction and its cursors must only be used by a single - * thread, and a thread may only have a single transaction at a time. - * If #MDB_NOTLS is in use, this does not apply to read-only transactions. - * @note Cursors may not span transactions. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] parent If this parameter is non-NULL, the new transaction - * will be a nested transaction, with the transaction indicated by \b parent - * as its parent. Transactions may be nested to any level. A parent - * transaction and its cursors may not issue any other operations than - * mdb_txn_commit and mdb_txn_abort while it has active child transactions. - * @param[in] flags Special options for this transaction. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_RDONLY - * This transaction will not perform any write operations. - *
- * @param[out] txn Address where the new #MDB_txn handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's - * mapsize and this environment's map must be resized as well. - * See #mdb_env_set_mapsize(). - *
  • #MDB_READERS_FULL - a read-only transaction was requested and - * the reader lock table is full. See #mdb_env_set_maxreaders(). - *
  • ENOMEM - out of memory. - *
- */ -int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **txn); - - /** @brief Returns the transaction's #MDB_env - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -MDB_env *mdb_txn_env(MDB_txn *txn); - - /** @brief Return the transaction's ID. - * - * This returns the identifier associated with this transaction. For a - * read-only transaction, this corresponds to the snapshot being read; - * concurrent readers will frequently have the same transaction ID. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A transaction ID, valid if input is an active transaction. - */ -size_t mdb_txn_id(MDB_txn *txn); - - /** @brief Commit all the operations of a transaction into the database. - * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdb_cursor_renew(). - * - * @note MDBX-mode: - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * - * @note LMDB-compatible mode: - * Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
  • ENOSPC - no more disk space. - *
  • EIO - a low-level I/O error occurred while writing. - *
  • ENOMEM - out of memory. - *
- */ -int mdb_txn_commit(MDB_txn *txn); - - /** @brief Abandon all the operations of the transaction instead of saving them. - * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdb_cursor_renew(). - * - * @note MDBX-mode: - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * - * @note LMDB-compatible mode: - * Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -int mdb_txn_abort(MDB_txn *txn); - - /** @brief Reset a read-only transaction. - * - * Abort the transaction like #mdb_txn_abort(), but keep the transaction - * handle. #mdb_txn_renew() may reuse the handle. This saves allocation - * overhead if the process will start a new read-only transaction soon, - * and also locking overhead if #MDB_NOTLS is in use. The reader table - * lock is released, but the table slot stays tied to its thread or - * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free - * its lock table slot if MDB_NOTLS is in use. - * Cursors opened within the transaction must not be used - * again after this call, except with #mdb_cursor_renew(). - * Reader locks generally don't interfere with writers, but they keep old - * versions of database pages allocated. Thus they prevent the old pages - * from being reused when writers commit new data, and so under heavy load - * the database size may grow much more rapidly than otherwise. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -int mdb_txn_reset(MDB_txn *txn); - - /** @brief Renew a read-only transaction. - * - * This acquires a new reader lock for a transaction handle that had been - * released by #mdb_txn_reset(). It must be called before a reset transaction - * may be used again. 
- * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_txn_renew(MDB_txn *txn); - -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdb_open(txn,name,flags,dbi) mdb_dbi_open(txn,name,flags,dbi) -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdb_close(env,dbi) mdb_dbi_close(env,dbi) - - /** @brief Open a database in the environment. - * - * A database handle denotes the name and parameters of a database, - * independently of whether such a database exists. - * The database handle may be discarded by calling #mdb_dbi_close(). - * The old database handle is returned if the database was already open. - * The handle may only be closed once. - * - * The database handle will be private to the current transaction until - * the transaction is successfully committed. If the transaction is - * aborted the handle will be closed automatically. - * After a successful commit the handle will reside in the shared - * environment, and may be used by other transactions. - * - * This function must not be called from multiple concurrent - * transactions in the same process. A transaction that uses - * this function must finish (either commit or abort) before - * any other transaction in the process may use this function. - * - * To use named databases (with name != NULL), #mdb_env_set_maxdbs() - * must be called before opening the environment. Database names are - * keys in the unnamed database, and may be read but not written. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] name The name of the database to open. If only a single - * database is needed in the environment, this value may be NULL. - * @param[in] flags Special options for this database. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_REVERSEKEY - * Keys are strings to be compared in reverse order, from the end - * of the strings to the beginning. By default, Keys are treated as strings and - * compared from beginning to end. - *
  • #MDB_DUPSORT - * Duplicate keys may be used in the database. (Or, from another perspective, - * keys may have multiple data items, stored in sorted order.) By default - * keys must be unique and may have only a single data item. - *
  • #MDB_INTEGERKEY - * Keys are binary integers in native byte order, either unsigned int - * or #mdb_size_t, and will be sorted as such. - * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdb_size_t.) - * The keys must all be of the same size. - *
  • #MDB_DUPFIXED - * This flag may only be used in combination with #MDB_DUPSORT. This option - * tells the library that the data items for this database are all the same - * size, which allows further optimizations in storage and retrieval. When - * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE - * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple - * items at once. - *
  • #MDB_INTEGERDUP - * This option specifies that duplicate data items are binary integers, - * similar to #MDB_INTEGERKEY keys. - *
  • #MDB_REVERSEDUP - * This option specifies that duplicate data items should be compared as - * strings in reverse order. - *
  • #MDB_CREATE - * Create the named database if it doesn't exist. This option is not - * allowed in a read-only transaction or a read-only environment. - *
- * @param[out] dbi Address where the new #MDB_dbi handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - the specified database doesn't exist in the environment - * and #MDB_CREATE was not specified. - *
  • #MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). - *
- */ -int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); - - /** @brief Retrieve statistics for a database. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] stat The address of an #MDB_stat structure - * where the statistics will be copied - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); - - /** @brief Retrieve the DB flags for a database handle. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] flags Address where the flags will be returned. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); - - /** @brief Close a database handle. Normally unnecessary. Use with care: - * - * This call is not mutex protected. Handles should only be closed by - * a single thread, and only if no other threads are going to reference - * the database handle or one of its cursors any further. Do not close - * a handle if an existing transaction has modified its database. - * Doing so can cause misbehavior from database corruption to errors - * like MDB_BAD_VALSIZE (since the DB name is gone). - * - * Closing a database handle is not necessary, but lets #mdb_dbi_open() - * reuse the handle value. Usually it's better to set a bigger - * #mdb_env_set_maxdbs(), unless that value would be large. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - */ -void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); - - /** @brief Empty or delete+close a database. - * - * See #mdb_dbi_close() for restrictions about closing the DB handle. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] del 0 to empty the DB, 1 to delete it from the - * environment and close the DB handle. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); - - /** @brief Set a custom key comparison function for a database. 
- * - * The comparison function is called whenever it is necessary to compare a - * key specified by the application with a key currently stored in the database. - * If no comparison function is specified, and no special key flags were specified - * with #mdb_dbi_open(), the keys are compared lexically, with shorter keys collating - * before longer keys. - * @warning This function must be called before any data access functions are used, - * otherwise data corruption may occur. The same comparison function must be used by every - * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - - /** @brief Set a custom data comparison function for a #MDB_DUPSORT database. - * - * This comparison function is called whenever it is necessary to compare a data - * item specified by the application with a data item currently stored in the database. - * This function only takes effect if the database was opened with the #MDB_DUPSORT - * flag. - * If no comparison function is specified, and no special key flags were specified - * with #mdb_dbi_open(), the data items are compared lexically, with shorter items collating - * before longer items. - * @warning This function must be called before any data access functions are used, - * otherwise data corruption may occur. The same comparison function must be used by every - * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - - /** @brief Set a relocation function for a #MDB_FIXEDMAP database. - * - * @todo The relocation function is called whenever it is necessary to move the data - * of an item to a different position in the database (e.g. through tree - * balancing operations, shifts as a result of adds or deletes, etc.). It is - * intended to allow address/position-dependent data items to be stored in - * a database in an environment opened with the #MDB_FIXEDMAP option. - * Currently the relocation feature is unimplemented and setting - * this function has no effect. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] rel A #MDB_rel_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); - - /** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation function. - * - * See #mdb_set_relfunc and #MDB_rel_func for more details. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * It will be passed to the callback function set by #mdb_set_relfunc - * as its \b relctx parameter whenever the callback is invoked. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); - - /** @brief Get items from a database. - * - * This function retrieves key/data pairs from the database. The address - * and length of the data associated with the specified \b key are returned - * in the structure to which \b data refers. - * If the database supports duplicate keys (#MDB_DUPSORT) then the - * first data item for the key will be returned. Retrieval of other - * items requires the use of #mdb_cursor_get(). - * - * @note The memory pointed to by the returned values is owned by the - * database. The caller need not dispose of the memory, and may not - * modify it in any way. For values returned in a read-only transaction - * any modification attempts will cause a SIGSEGV. - * @note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to search for in the database - * @param[out] data The data corresponding to the key - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - the key was not in the database. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); - - /** @brief Store items into a database. - * - * This function stores key/data pairs in the database. The default behavior - * is to enter the new key/data pair, replacing any previously existing key - * if duplicates are disallowed, or adding a duplicate data item if - * duplicates are allowed (#MDB_DUPSORT). - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to store in the database - * @param[in,out] data The data to store - * @param[in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be specified - * if the database was opened with #MDB_DUPSORT. The function will - * return #MDB_KEYEXIST if the key/data pair already appears in the - * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDB_KEYEXIST if the key already appears in the database, even if - * the database supports duplicates (#MDB_DUPSORT). The \b data - * parameter will be set to point to the existing item. - *
  • #MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. - * LMDB does nothing else with this memory, the caller is expected - * to modify all of the space requested. This flag must not be - * specified if the database was opened with #MDB_DUPSORT. - *
  • #MDB_APPEND - append the given key/data pair to the end of the - * database. This option allows fast bulk loading when keys are - * already known to be in the correct order. Loading unsorted keys - * with this flag will cause a #MDB_KEYEXIST error. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, - unsigned flags); - - /** @brief Delete items from a database. - * - * This function removes key/data pairs from the database. - * - * MDBX-mode: - * The data parameter is NOT ignored regardless the database does - * support sorted duplicate data items or not. If the data parameter - * is non-NULL only the matching data item will be deleted. - * - * LMDB-compatible mode: - * If the database does not support sorted duplicate data items - * (#MDB_DUPSORT) the data parameter is ignored. - * If the database supports sorted duplicates and the data parameter - * is NULL, all of the duplicate data items for the key will be - * deleted. Otherwise, if the data parameter is non-NULL - * only the matching data item will be deleted. - * - * This function will return #MDB_NOTFOUND if the specified key/data - * pair is not in the database. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to delete from the database - * @param[in] data The data to delete - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); - - /** @brief Create a cursor handle. - * - * A cursor is associated with a specific transaction and database. - * A cursor cannot be used when its database handle is closed. Nor - * when its transaction has ended, except with #mdb_cursor_renew(). - * It can be discarded with #mdb_cursor_close(). - * - * MDBX-mode: - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * - * LMDB-compatible mode: - * A cursor in a write-transaction can be closed before its transaction - * ends, and will otherwise be closed when its transaction ends. - * A cursor in a read-only transaction must be closed explicitly, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * @note Earlier documentation said that cursors in every transaction - * were closed when the transaction committed or aborted. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] cursor Address where the new #MDB_cursor handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); - - /** @brief Close a cursor handle. - * - * The cursor handle will be freed and must not be used again after this call. - * Its transaction must still be live if it is a write-transaction. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -void mdb_cursor_close(MDB_cursor *cursor); - - /** @brief Renew a cursor handle. - * - * A cursor is associated with a specific transaction and database. - * Cursors that are only used in read-only - * transactions may be re-used, to avoid unnecessary malloc/free overhead. - * The cursor may be associated with a new read-only transaction, and - * referencing the same database handle as it was created with. - * This may be done whether the previous transaction is live or dead. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); - - /** @brief Return the cursor's transaction handle. - * - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); - - /** @brief Return the cursor's database handle. - * - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); - - /** @brief Retrieve by cursor. - * - * This function retrieves key/data pairs from the database. The address and length - * of the key are returned in the object to which \b key refers (except for the - * case of the #MDB_SET option, in which the \b key object is unchanged), and - * the address and length of the data are returned in the object to which \b data - * refers. - * See #mdb_get() for restrictions on using the output values. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in,out] key The key for a retrieved item - * @param[in,out] data The data of a retrieved item - * @param[in] op A cursor operation #MDB_cursor_op - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - no matching key found. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - MDB_cursor_op op); - - /** @brief Store by cursor. - * - * This function stores key/data pairs into the database. - * The cursor is positioned at the new item, or on failure usually near it. - * @note Earlier documentation incorrectly said errors would leave the - * state of the cursor unchanged. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in] key The key operated on. - * @param[in] data The data operated on. - * @param[in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - *
    - *
  • #MDB_CURRENT - replace the item at the current cursor position. - * The \b key parameter must still be provided, and must match it. - * If using sorted duplicates (#MDB_DUPSORT) the data item must still - * sort into the same place. This is intended to be used when the - * new data is the same size as the old. Otherwise it will simply - * perform a delete of the old record followed by an insert. - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be specified - * if the database was opened with #MDB_DUPSORT. The function will - * return #MDB_KEYEXIST if the key/data pair already appears in the - * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDB_KEYEXIST if the key already appears in the database, even if - * the database supports duplicates (#MDB_DUPSORT). - *
  • #MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. This flag - * must not be specified if the database was opened with #MDB_DUPSORT. - *
  • #MDB_APPEND - append the given key/data pair to the end of the - * database. No key comparisons are performed. This option allows - * fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * a #MDB_KEYEXIST error. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a - * single request. This flag may only be specified if the database - * was opened with #MDB_DUPFIXED. The \b data argument must be an - * array of two MDB_vals. The mv_size of the first MDB_val must be - * the size of a single data element. The mv_data of the first MDB_val - * must point to the beginning of the array of contiguous data elements. - * The mv_size of the second MDB_val must be the count of the number - * of data elements to store. On return this field will be set to - * the count of the number of elements actually written. The mv_data - * of the second MDB_val is unused. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - unsigned flags); - - /** @brief Delete current key/data pair - * - * This function deletes the key/data pair to which the cursor refers. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - *
    - *
  • #MDB_NODUPDATA - delete all of the data items for the current key. - * This flag may only be specified if the database was opened with #MDB_DUPSORT. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_del(MDB_cursor *cursor, unsigned flags); - - /** @brief Return count of duplicates for current key. - * - * This call is only valid on databases that support sorted duplicate - * data items #MDB_DUPSORT. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[out] countp Address where the count will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - cursor is not initialized, or an invalid parameter was specified. - *
- */ -int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); - - /** @brief Compare two data items according to a particular database. - * - * This returns a comparison as if the two data items were keys in the - * specified database. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b - */ -int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); - - /** @brief Compare two data items according to a particular database. - * - * This returns a comparison as if the two items were data items of - * the specified database. The database must have the #MDB_DUPSORT flag. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b - */ -int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); - - /** @brief A callback function used to print a message from the library. - * - * @param[in] msg The string to be printed. - * @param[in] ctx An arbitrary context pointer for the callback. - * @return < 0 on failure, >= 0 on success. - */ -typedef int (MDB_msg_func)(const char *msg, void *ctx); - - /** @brief Dump the entries in the reader lock table. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] func A #MDB_msg_func function - * @param[in] ctx Anything the message function needs - * @return < 0 on failure, >= 0 on success. - */ -int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); - - /** @brief Check for stale entries in the reader lock table. 
- * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] dead Number of stale slots that were cleared - * @return 0 on success, non-zero on failure. - */ -int mdb_reader_check(MDB_env *env, int *dead); -/** @} */ - -char* mdb_dkey(MDB_val *key, char *buf); - -#ifdef __cplusplus -} -#endif -/** @page tools LMDB Command Line Tools - The following describes the command line tools that are available for LMDB. - \li \ref mdb_chk_1 - \li \ref mdb_copy_1 - \li \ref mdb_dump_1 - \li \ref mdb_load_1 - \li \ref mdb_stat_1 -*/ - -#endif /* _LMDB_H_ */ diff --git a/mdb.c b/mdb.c deleted file mode 100644 index c26cf0d1..00000000 --- a/mdb.c +++ /dev/null @@ -1,10723 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * - * This code is derived from "LMDB engine" written by - * Howard Chu (Symas Corporation), which itself derived from btree.c - * written by Martin Hedenfalk. - * - * --- - * - * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * --- - * - * Portions Copyright (c) 2009, 2010 Martin Hedenfalk - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef MDB_DEBUG -# define MDB_DEBUG 0 -#endif - -#ifndef _GNU_SOURCE -# define _GNU_SOURCE -#endif - -/* LY: Please do not ask us for Windows support, just never! - * But you can make a fork for Windows, or become maintainer for FreeBSD... */ -#ifndef __gnu_linux__ -# warning "libmdbx supports only GNU Linux" -#endif - -#include - -#if !defined(__GNUC__) || !__GNUC_PREREQ(4,2) - /* LY: Actualy libmdbx was not tested with compilers - * older than GCC 4.4 (from RHEL6). - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old compilers. - */ -# warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." -#endif - -#if !defined(__GNU_LIBRARY__) || !__GLIBC_PREREQ(2,12) - /* LY: Actualy libmdbx was not tested with something - * older than glibc 2.12 (from RHEL6). - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old systems. - */ -# warning "libmdbx required at least GLIBC 2.12." 
-#endif - -#if MDB_DEBUG -# undef NDEBUG -#endif - -#include "./reopen.h" -#include "./barriers.h" - -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_FILE_H -# include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) -# include -# include /* defines BYTE_ORDER on HPUX and Solaris */ -#endif - -#ifndef _POSIX_SYNCHRONIZED_IO -# define fdatasync fsync -#endif - -#ifndef BYTE_ORDER -# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) - /* Solaris just defines one or the other */ -# define LITTLE_ENDIAN 1234 -# define BIG_ENDIAN 4321 -# ifdef _LITTLE_ENDIAN -# define BYTE_ORDER LITTLE_ENDIAN -# else -# define BYTE_ORDER BIG_ENDIAN -# endif -# else -# define BYTE_ORDER __BYTE_ORDER -# endif -#endif - -#ifndef LITTLE_ENDIAN -# define LITTLE_ENDIAN __LITTLE_ENDIAN -#endif -#ifndef BIG_ENDIAN -# define BIG_ENDIAN __BIG_ENDIAN -#endif - -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -# define MISALIGNED_OK 1 -#endif - -#include "./lmdb.h" -#include "./midl.h" - -#if ! MDBX_MODE_ENABLED -# define MDBX_COALESCE 0 -# define MDBX_LIFORECLAIM 0 -# define MDBX_DBG_ASSERT 0 -# define MDBX_DBG_PRINT 0 -# define MDBX_DBG_TRACE 0 -# define MDBX_DBG_EXTRA 0 -# define MDBX_DBG_AUDIT 0 -# define MDBX_DBG_EDGE 0 -# define mdb_runtime_flags 0 -# define mdb_debug_logger ((void (*)(int, ...)) NULL) -# define MDBX_ONLY_FEATURE static -#else -# define MDBX_ONLY_FEATURE -#endif /* ! 
MDBX_MODE_ENABLED */ - -#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) -# error "Unknown or unsupported endianness (BYTE_ORDER)" -#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -# error "Two's complement, reasonably sized integer types, please" -#endif - -/** @defgroup internal LMDB Internals - * @{ - */ -/** @defgroup compat Compatibility Macros - * A bunch of macros to minimize the amount of platform-specific ifdefs - * needed throughout the rest of the code. When the features this library - * needs are similar enough to POSIX to be hidden in a one-or-two line - * replacement, this macro approach is used. - * @{ - */ - - /** Features under development */ -#ifndef MDB_DEVEL -# define MDB_DEVEL 0 -#endif - - /** Wrapper around __func__, which is a C99 feature */ -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -# define mdb_func_ __func__ -#elif (defined(__GNUC__) && __GNUC__ >= 2) || defined(__clang__) -# define mdb_func_ __FUNCTION__ -#else - /* If a debug message says (), update the #if statements above */ -# define mdb_func_ "" -#endif - -/** Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDB_USE_ROBUST=0. - */ -#ifndef MDB_USE_ROBUST - /* Howard Chu: Android currently lacks Robust Mutex support */ -# if defined(EOWNERDEAD) && !defined(ANDROID) \ - /* LY: glibc before 2.10 has a troubles with Robust Mutex too. */ \ - && __GLIBC_PREREQ(2,10) -# define MDB_USE_ROBUST 1 -# else -# define MDB_USE_ROBUST 0 -# endif -#endif /* MDB_USE_ROBUST */ - -/* Internal error codes, not exposed outside liblmdb */ -#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) - - /** Mutex for the reader table (rw = r) or write transaction (rw = w). - */ -#define MDB_MUTEX(env, rw) \ - (&(env)->me_txns->mti_##rw##mutex) - - /** An abstraction for a file handle. - * On POSIX systems file handles are small integers. On Windows - * they're opaque pointers. 
- */ -#define HANDLE int - - /** A value for an invalid file handle. - * Mainly used to initialize file variables and signify that they are - * unused. - */ -#define INVALID_HANDLE_VALUE (-1) - - /** Get the size of a memory page for the system. - * This is the basic size that the platform's memory manager uses, and is - * fundamental to the use of memory-mapped files. - */ -#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) - -/** @} */ - -static int mdb_mutex_lock(MDB_env *env, pthread_mutex_t *mutex); -static int mdb_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); -static void mdb_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex); - - /** A page number in the database. - * Note that 64 bit page numbers are overkill, since pages themselves - * already represent 12-13 bits of addressable memory, and the OS will - * always limit applications to a maximum of 63 bits of address space. - * - * @note In the #MDB_node structure, we only store 48 bits of this value, - * which thus limits us to only 60 bits of addressable data. - */ -typedef MDB_ID pgno_t; - - /** A transaction ID. - * See struct MDB_txn.mt_txnid for details. - */ -typedef MDB_ID txnid_t; - -/** @defgroup debug Debug Macros - * @{ - */ - /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ -#define DDBI(mc) \ - (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) -/** @} */ - - /** @brief The maximum size of a database page. - * - * It is 32k or 64k, since value-PAGEBASE must fit in - * #MDB_page.%mp_upper. - * - * LMDB will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. 
We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. - */ -#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) - - /** The minimum number of keys required in a database page. - * Setting this to a larger value will place a smaller bound on the - * maximum size of a data item. Data items larger than this size will - * be pushed into overflow pages instead of being stored directly in - * the B-tree node. This value used to default to 4. With a page size - * of 4096 bytes that meant that any item larger than 1024 bytes would - * go into an overflow page. That also meant that on average 2-3KB of - * each overflow page was wasted space. The value cannot be lower than - * 2 because then there would no longer be a tree structure. With this - * value, items larger than 2KB will go into overflow pages, and on - * average only 1KB will be wasted. - */ -#define MDB_MINKEYS 2 - - /** A stamp that identifies a file as an LMDB file. - * There's nothing special about this value other than that it is easily - * recognizable, and it will reflect any byte order mismatches. - */ -#define MDB_MAGIC 0xBEEFC0DE - - /** The version number for a database's datafile format. */ -#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) - /** The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) - - /** @brief The max size of a key we can write, or 0 for computed max. - * - * This macro should normally be left alone or set to 0. - * Note that a database with big keys or dupsort data cannot be - * reliably modified by a liblmdb which uses a smaller max. - * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. - * - * Other values are allowed, for backwards compat. 
However: - * A value bigger than the computed max can break if you do not - * know what you are doing, and liblmdb <= 0.9.10 can break when - * modifying a DB with keys/dupsort data bigger than its max. - * - * Data items in an #MDB_DUPSORT database are also limited to - * this size, since they're actually keys of a sub-DB. Keys and - * #MDB_DUPSORT data items must fit on a node in a regular page. - */ -#ifndef MDB_MAXKEYSIZE -# define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) -#endif - - /** The maximum size of a key we can write to the environment. */ -#if MDB_MAXKEYSIZE -# define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) -#else -# define ENV_MAXKEY(env) ((env)->me_maxkey_limit) -#endif /* MDB_MAXKEYSIZE */ - - /** @brief The maximum size of a data item. - * - * We only store a 32 bit value for node sizes. - */ -#define MAXDATASIZE 0xffffffffUL - - /** Key size which fits in a #DKBUF. - * @ingroup debug - */ -#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) - /** A key buffer. - * @ingroup debug - * This is used for printing a hex dump of a key's contents. - */ -#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] - /** Display a key in hex. - * @ingroup debug - * Invoke a function to display a key in hex. - */ -#define DKEY(x) mdb_dkey(x, kbuf) - - /** An invalid page number. - * Mainly used to denote an empty tree. - */ -#define P_INVALID (~(pgno_t)0) - - /** Test if the flags \b f are set in a flag word \b w. */ -#define F_ISSET(w, f) (((w) & (f)) == (f)) - - /** Round \b n up to an even number. */ -#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ - - /** Used for offsets within a single page. - * Since memory pages are typically 4 or 8KB in size, 12-13 bits, - * this is plenty. - */ -typedef uint16_t indx_t; - - /** Default size of memory map. - * This is certainly too small for any actual applications. Apps should always set - * the size explicitly using #mdb_env_set_mapsize(). 
- */ -#define DEFAULT_MAPSIZE 1048576 - -/** @defgroup readers Reader Lock Table - * Readers don't acquire any locks for their data access. Instead, they - * simply record their transaction ID in the reader table. The reader - * mutex is needed just to find an empty slot in the reader table. The - * slot's address is saved in thread-specific data so that subsequent read - * transactions started by the same thread need no further locking to proceed. - * - * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. - * - * No reader table is used if the database is on a read-only filesystem, or - * if #MDB_NOLOCK is set. - * - * Since the database uses multi-version concurrency control, readers don't - * actually need any locking. This table is used to keep track of which - * readers are using data from which old transactions, so that we'll know - * when a particular old transaction is no longer in use. Old transactions - * that have discarded any data pages can then have those pages reclaimed - * for use by a later write transaction. - * - * The lock table is constructed such that reader slots are aligned with the - * processor's cache line size. Any slot is only ever used by one thread. - * This alignment guarantees that there will be no contention or cache - * thrashing as threads update their own slot info, and also eliminates - * any need for locking when accessing a slot. - * - * A writer thread will scan every slot in the table to determine the oldest - * outstanding reader transaction. Any freed pages older than this will be - * reclaimed by the writer. The writer doesn't use any locks when scanning - * this table. This means that there's no guarantee that the writer will - * see the most up-to-date reader info, but that's not required for correct - * operation - all we need is to know the upper bound on the oldest reader, - * we don't care at all about the newest reader. 
So the only consequence of - * reading stale information here is that old pages might hang around a - * while longer before being reclaimed. That's actually good anyway, because - * the longer we delay reclaiming old pages, the more likely it is that a - * string of contiguous pages can be found after coalescing old pages from - * many old transactions together. - * @{ - */ - /** Number of slots in the reader table. - * This value was chosen somewhat arbitrarily. 126 readers plus a - * couple mutexes fit exactly into 8KB on my development machine. - * Applications should set the table size using #mdb_env_set_maxreaders(). - */ -#define DEFAULT_READERS 126 - - /** The information we store in a single slot of the reader table. - * In addition to a transaction ID, we also record the process and - * thread ID that owns a slot, so that we can detect stale information, - * e.g. threads or processes that went away without cleaning up. - * @note We currently don't check for stale records. We simply re-init - * the table when we know that we're the only process opening the - * lock file. - */ -typedef struct MDB_rxbody { - /** Current Transaction ID when this transaction began, or (txnid_t)-1. - * Multiple readers that start at the same time will probably have the - * same ID here. Again, it's not important to exclude them from - * anything; all we need to know is which version of the DB they - * started from so we can avoid overwriting any data used in that - * particular version. - */ - volatile txnid_t mrb_txnid; - /** The process ID of the process owning this reader txn. */ - volatile pid_t mrb_pid; - /** The thread ID of the thread owning this txn. */ - volatile pthread_t mrb_tid; -} MDB_rxbody; - - /** The actual reader record, with cacheline padding. 
*/ -typedef struct MDB_reader { - union { - MDB_rxbody mrx; - /** shorthand for mrb_txnid */ -#define mr_txnid mru.mrx.mrb_txnid -#define mr_pid mru.mrx.mrb_pid -#define mr_tid mru.mrx.mrb_tid - /** cache line alignment */ - char pad[(sizeof(MDB_rxbody)+CACHELINE_SIZE-1) & ~(CACHELINE_SIZE-1)]; - } mru; -} MDB_reader; - - /** The header for the reader table. - * The table resides in a memory-mapped file. (This is a different file - * than is used for the main database.) - * - * For POSIX the actual mutexes reside in the shared memory of this - * mapped file. On Windows, mutexes are named objects allocated by the - * kernel; we store the mutex names in this mapped file so that other - * processes can grab them. This same approach is also used on - * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support - * process-shared POSIX mutexes. For these cases where a named object - * is used, the object name is derived from a 64 bit FNV hash of the - * environment pathname. As such, naming collisions are extremely - * unlikely. If a collision occurs, the results are unpredictable. - */ -typedef struct MDB_txbody { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mtb_magic; - /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ - uint32_t mtb_format; - /** Mutex protecting access to this table. - * This is the #MDB_MUTEX(env,r) reader table lock. - */ - pthread_mutex_t mtb_rmutex; - /** The ID of the last transaction committed to the database. - * This is recorded here only for convenience; the value can always - * be determined by reading the main database meta pages. - */ - volatile txnid_t mtb_txnid; - /** The number of slots that have been used in the reader table. - * This always records the maximum count, it is not decremented - * when readers release their slots. - */ - volatile unsigned mtb_numreaders; -} MDB_txbody; - - /** The actual reader table definition. 
*/ -typedef struct MDB_txninfo { - union { - MDB_txbody mtb; -#define mti_magic mt1.mtb.mtb_magic -#define mti_format mt1.mtb.mtb_format -#define mti_rmutex mt1.mtb.mtb_rmutex -#define mti_rmname mt1.mtb.mtb_rmname -#define mti_txnid mt1.mtb.mtb_txnid -#define mti_numreaders mt1.mtb.mtb_numreaders - char pad[(sizeof(MDB_txbody)+CACHELINE_SIZE-1) & ~(CACHELINE_SIZE-1)]; - } mt1; - union { - pthread_mutex_t mt2_wmutex; -# define mti_wmutex mt2.mt2_wmutex - char pad[(sizeof(pthread_mutex_t)+CACHELINE_SIZE-1) & ~(CACHELINE_SIZE-1)]; - } mt2; - MDB_reader mti_readers[1]; -} MDB_txninfo; - - /** Lockfile format signature: version, features and field layout */ -#define MDB_LOCK_FORMAT \ - ((uint32_t) \ - ((MDB_LOCK_VERSION) \ - /* Flags which describe functionality */ \ - + (0 /* SYSV_SEM_FLAG */ << 18) \ - + (1 /* MDB_PIDLOCK */ << 16))) -/** @} */ - -/** Common header for all page types. The page type depends on #mp_flags. - * - * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with - * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages - * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. - * - * #P_OVERFLOW records occupy one or more contiguous pages where only the - * first has a page header. They hold the real data of #F_BIGDATA nodes. - * - * #P_SUBP sub-pages are small leaf "pages" with duplicate data. - * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. - * (Duplicate data can also go in sub-databases, which use normal pages.) - * - * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. - * - * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once - * in the snapshot: Either used by a database or listed in a freeDB record. 
- */ -typedef struct MDB_page { -#define mp_pgno mp_p.p_pgno -#define mp_next mp_p.p_next - union { - pgno_t p_pgno; /**< page number */ - struct MDB_page *p_next; /**< for in-memory list of freed pages */ - } mp_p; - uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ -/** @defgroup mdb_page Page Flags - * @ingroup internal - * Flags for the page headers. - * @{ - */ -#define P_BRANCH 0x01 /**< branch page */ -#define P_LEAF 0x02 /**< leaf page */ -#define P_OVERFLOW 0x04 /**< overflow page */ -#define P_META 0x08 /**< meta page */ -#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ -#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ -#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ -#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ -#define P_KEEP 0x8000 /**< leave this page alone during spill */ -/** @} */ - uint16_t mp_flags; /**< @ref mdb_page */ -#define mp_lower mp_pb.pb.pb_lower -#define mp_upper mp_pb.pb.pb_upper -#define mp_pages mp_pb.pb_pages - union { - struct { - indx_t pb_lower; /**< lower bound of free space */ - indx_t pb_upper; /**< upper bound of free space */ - } pb; - uint32_t pb_pages; /**< number of overflow pages */ - } mp_pb; - indx_t mp_ptrs[1]; /**< dynamic size */ -} MDB_page; - - /** Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) - - /** Address of first usable data byte in a page, after the header */ -#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) - - /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) - - /** Number of nodes on a page */ -#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) - - /** The amount of space remaining in the page */ -#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) - - /** The percentage of space used in the page, in tenths of a percent. 
*/ -#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ - ((env)->me_psize - PAGEHDRSZ)) - /** The minimum page fill factor, in tenths of a percent. - * Pages emptier than this are candidates for merging. - */ -#define FILL_THRESHOLD 250 - - /** Test if a page is a leaf page */ -#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) - /** Test if a page is a LEAF2 page */ -#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) - /** Test if a page is a branch page */ -#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) - /** Test if a page is an overflow page */ -#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) - /** Test if a page is a sub page */ -#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) - - /** The number of overflow pages needed to store the given size. */ -#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - - /** Link in #MDB_txn.%mt_loose_pgs list. - * Kept outside the page header, which is needed when reusing the page. - */ -#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) - - /** Header for a single key/data pair within a page. - * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. - * We guarantee 2-byte alignment for 'MDB_node's. - * - * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child - * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used - * for pgno. (Branch nodes have no flags). Lo and hi are in host byte - * order in case some accesses can be optimized to 32-bit word access. - * - * Leaf node flags describe node contents. #F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. - * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just #F_SUBDATA). 
- */ -typedef struct MDB_node { - /** part of data size or pgno - * @{ */ -#if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; -#else - unsigned short mn_hi, mn_lo; -#endif - /** @} */ -/** @defgroup mdb_node Node Flags - * @ingroup internal - * Flags for node headers. - * @{ - */ -#define F_BIGDATA 0x01 /**< data put on overflow page */ -#define F_SUBDATA 0x02 /**< data is a sub-database */ -#define F_DUPDATA 0x04 /**< data has duplicates */ - -/** valid flags for #mdb_node_add() */ -#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) - -/** @} */ - unsigned short mn_flags; /**< @ref mdb_node */ - unsigned short mn_ksize; /**< key size */ - char mn_data[1]; /**< key and data are appended here */ -} MDB_node; - - /** Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDB_node, mn_data) - - /** Bit position of top word in page number, for shifting mn_flags */ -#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) - - /** Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. - */ -#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) - - /** Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. - */ -#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) - - /** Address of node \b i in page \b p */ -#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) - - /** Address of the key for the node */ -#define NODEKEY(node) (void *)((node)->mn_data) - - /** Address of the data for a node */ -#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) - - /** Get the page number pointed to by a branch node */ -#define NODEPGNO(node) \ - ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ - (PGNO_TOPWORD ? 
((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) - /** Set the page number in a branch node */ -#define SETPGNO(node,pgno) do { \ - (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ - if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) - - /** Get the size of the data in a leaf node */ -#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) - /** Set the size of the data for a leaf node */ -#define SETDSZ(node,size) do { \ - (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) - /** The size of a key in a node */ -#define NODEKSZ(node) ((node)->mn_ksize) - - /** Copy a page number from src to dst */ -#ifdef MISALIGNED_OK -# define COPY_PGNO(dst,src) dst = src -#elif SIZE_MAX > 4294967295UL -# define COPY_PGNO(dst,src) do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ - *d++ = *s++; \ - *d++ = *s++; \ - *d++ = *s++; \ - *d = *s; \ - } while (0) -#else -# define COPY_PGNO(dst,src) do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ - *d++ = *s++; \ - *d = *s; \ - } while (0) -#endif /* MISALIGNED_OK */ - -/** The address of a key in a LEAF2 page. - * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. - * There are no node headers, keys are stored contiguously. - */ -#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) - - /** Set the \b node's key into \b keyptr, if requested. */ -#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ - (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } - - /** Set the \b node's key into \b key. */ -#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } - - /** Information about a single database in the environment. 
*/ -typedef struct MDB_db { - uint32_t md_xsize; /**< also ksize for LEAF2 pages */ - uint16_t md_flags; /**< @ref mdb_dbi_open */ - uint16_t md_depth; /**< depth of this tree */ - pgno_t md_branch_pages; /**< number of internal pages */ - pgno_t md_leaf_pages; /**< number of leaf pages */ - pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ - pgno_t md_root; /**< the root page of this tree */ -} MDB_db; - -#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ -#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) - /** #mdb_dbi_open() flags */ -#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ - MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) - - /** Handle for the DB used to track free pages. */ -#define FREE_DBI 0 - /** Handle for the default DB. */ -#define MAIN_DBI 1 - /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ -#define CORE_DBS 2 - - /** Number of meta pages - also hardcoded elsewhere */ -#define NUM_METAS 2 - - /** Meta page content. - * A meta page is the start point for accessing a database snapshot. - * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). - */ -typedef struct MDB_meta { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mm_magic; - /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ - uint32_t mm_version; - void *mm_address; /**< address for fixed mapping */ - size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ - /** The size of pages used in this DB */ -#define mm_psize mm_dbs[FREE_DBI].md_xsize - /** Any persistent environment flags. @ref mdb_env */ -#define mm_flags mm_dbs[FREE_DBI].md_flags - /** Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. 
- */ - pgno_t mm_last_pg; - volatile txnid_t mm_txnid; /**< txnid that committed this page */ -#define MDB_DATASIGN_NONE 0 -#define MDB_DATASIGN_WEAK 1 - volatile uint64_t mm_datasync_sign; -#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) -#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) - -#if MDBX_MODE_ENABLED - volatile mdbx_canary mm_canary; -#endif -} MDB_meta; - - /** Buffer for a stack-allocated meta page. - * The members define size and alignment, and silence type - * aliasing warnings. They are not used directly; that could - * mean incorrectly using several union members in parallel. - */ -typedef union MDB_metabuf { - MDB_page mb_page; - struct { - char mm_pad[PAGEHDRSZ]; - MDB_meta mm_meta; - } mb_metabuf; -} MDB_metabuf; - - /** Auxiliary DB info. - * The information here is mostly static/read-only. There is - * only a single copy of this record in the environment. - */ -typedef struct MDB_dbx { - MDB_val md_name; /**< name of the database */ - MDB_cmp_func *md_cmp; /**< function for comparing keys */ - MDB_cmp_func *md_dcmp; /**< function for comparing data items */ - MDB_rel_func *md_rel; /**< user relocate function */ - void *md_relctx; /**< user-provided context for md_rel */ -} MDB_dbx; - -#if MDBX_MODE_ENABLED -# define MDBX_MODE_SALT 0 -#else -# define MDBX_MODE_SALT 1115449266 -#endif - - /** A database transaction. - * Every operation requires a transaction handle. - */ -struct MDB_txn { -#define MDBX_MT_SIGNATURE (0x93D53A31^MDBX_MODE_SALT) - unsigned mt_signature; - MDB_txn *mt_parent; /**< parent of a nested txn */ - /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ - MDB_txn *mt_child; - pgno_t mt_next_pgno; /**< next unallocated page */ - /** The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. 
- */ - txnid_t mt_txnid; - MDB_env *mt_env; /**< the DB environment */ - /** The list of reclaimed txns from freeDB */ - MDB_IDL mt_lifo_reclaimed; - /** The list of pages that became unused during this transaction. - */ - MDB_IDL mt_free_pgs; - /** The list of loose pages that became unused and may be reused - * in this transaction, linked through #NEXT_LOOSE_PAGE(page). - */ - MDB_page *mt_loose_pgs; - /** Number of loose pages (#mt_loose_pgs) */ - int mt_loose_count; - /** The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. - */ - MDB_IDL mt_spill_pgs; - union { - /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ - MDB_ID2L dirty_list; - /** For read txns: This thread/txn's reader table slot, or NULL. */ - MDB_reader *reader; - } mt_u; - /** Array of records for each DB known in the environment. */ - MDB_dbx *mt_dbxs; - /** Array of MDB_db records for each known DB */ - MDB_db *mt_dbs; - /** Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; -/** @defgroup mt_dbflag Transaction DB Flags - * @ingroup internal - * @{ - */ -#define DB_DIRTY 0x01 /**< DB was written in this txn */ -#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ -#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ -#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ -#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ -/** @} */ - /** In write txns, array of cursors for each DB */ - MDB_cursor **mt_cursors; - /** Array of flags for each DB */ - unsigned char *mt_dbflags; - /** Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. 
- */ - MDB_dbi mt_numdbs; - -/** @defgroup mdb_txn Transaction Flags - * @ingroup internal - * @{ - */ - /** #mdb_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC|MDB_NOSYNC|MDB_RDONLY) -#define MDB_TXN_NOMETASYNC MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ -#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ -#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ - /* internal txn flags */ -#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ -#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ -#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ -#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ -#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ -#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ - /** most operations on the txn are currently illegal */ -#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) -/** @} */ - unsigned mt_flags; /**< @ref mdb_txn */ - /** #dirty_list room: Array size - \#dirty pages visible to this txn. - * Includes ancestor txns' dirty pages not hidden by other txns' - * dirty/spilled pages. Thus commit(nested txn) has room to merge - * dirty_list into mt_parent after freeing hidden mt_parent pages. - */ - unsigned mt_dirty_room; - -#if MDBX_MODE_ENABLED - mdbx_canary mt_canary; -#endif -}; - -/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. - */ -#define CURSOR_STACK 32 - -struct MDB_xcursor; - - /** Cursors are used for all DB operations. - * A cursor holds a path of (page pointer, key index) from the DB - * root to a position in the DB, plus other state. #MDB_DUPSORT - * cursors include an xcursor to the current data item. 
Write txns - * track their cursors and keep them up to date when data moves. - * Exception: An xcursor's pointer to a #P_SUBP page can be stale. - * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). - */ -struct MDB_cursor { -#define MDBX_MC_SIGNATURE (0xFE05D5B1^MDBX_MODE_SALT) -#define MDBX_MC_READY4CLOSE (0x2817A047^MDBX_MODE_SALT) -#define MDBX_MC_WAIT4EOT (0x90E297A7^MDBX_MODE_SALT) - unsigned mc_signature; - /** Next cursor on this DB in this txn */ - MDB_cursor *mc_next; - /** Backup of the original cursor if this cursor is a shadow */ - MDB_cursor *mc_backup; - /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ - struct MDB_xcursor *mc_xcursor; - /** The transaction that owns this cursor */ - MDB_txn *mc_txn; - /** The database handle this cursor operates on */ - MDB_dbi mc_dbi; - /** The database record for this cursor */ - MDB_db *mc_db; - /** The database auxiliary record for this cursor */ - MDB_dbx *mc_dbx; - /** The @ref mt_dbflag for this database */ - unsigned char *mc_dbflag; - unsigned short mc_snum; /**< number of pushed pages */ - unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ -/** @defgroup mdb_cursor Cursor Flags - * @ingroup internal - * Cursor state flags. - * @{ - */ -#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ -#define C_EOF 0x02 /**< No more data */ -#define C_SUB 0x04 /**< Cursor is a sub-cursor */ -#define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ -#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ -/** @} */ - unsigned mc_flags; /**< @ref mdb_cursor */ - MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ - indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ -}; - - /** Context for sorted-dup records. - * We could have gone to a fully recursive design, with arbitrarily - * deep nesting of sub-databases. 
But for now we only handle these - * levels - main DB, optional sub-DB, sorted-duplicate DB. - */ -typedef struct MDB_xcursor { - /** A sub-cursor for traversing the Dup DB */ - MDB_cursor mx_cursor; - /** The database record for this Dup DB */ - MDB_db mx_db; - /** The auxiliary DB record for this Dup DB */ - MDB_dbx mx_dbx; - /** The @ref mt_dbflag for this Dup DB */ - unsigned char mx_dbflag; -} MDB_xcursor; - - /** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ -#define XCURSOR_INITED(mc) \ - ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - - /** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed - * when the node which contains the sub-page may have moved. Called - * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. - */ -#define XCURSOR_REFRESH(mc, mp, ki) do { \ - MDB_page *xr_pg = (mp); \ - MDB_node *xr_node = NODEPTR(xr_pg, ki); \ - if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ - (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ -} while (0) - - /** State of FreeDB old pages, stored in the MDB_env */ -typedef struct MDB_pgstate { - pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ - txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ -} MDB_pgstate; - - /** Context for deferred cleanup of reader's threads. - * to avoid https://github.com/ReOpen/ReOpenLDAP/issues/48 */ -typedef struct MDBX_rthc { - struct MDBX_rthc *rc_next; - pthread_t rc_thread; - MDB_reader *rc_reader; -} MDBX_rthc; - -static MDBX_rthc* mdbx_rthc_get(pthread_key_t key); - - /** The database environment. */ -struct MDB_env { -#define MDBX_ME_SIGNATURE (0x9A899641^MDBX_MODE_SALT) - unsigned me_signature; - HANDLE me_fd; /**< The main data file */ - HANDLE me_lfd; /**< The lock file */ - /** Failed to update the meta page. Probably an I/O error. */ -#define MDB_FATAL_ERROR 0x80000000U - /** Some fields are initialized. 
*/ -#define MDB_ENV_ACTIVE 0x20000000U - /** me_txkey is set */ -#define MDB_ENV_TXKEY 0x10000000U - uint32_t me_flags; /**< @ref mdb_env */ - unsigned me_psize; /**< DB page size, inited from me_os_psize */ - unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */ - unsigned me_maxreaders; /**< size of the reader table */ - /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ - unsigned me_close_readers; - MDB_dbi me_numdbs; /**< number of DBs opened */ - MDB_dbi me_maxdbs; /**< size of the DB table */ - pid_t me_pid; /**< process ID of this env */ - char *me_path; /**< path to the DB files */ - char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file, never NULL */ - void *me_pbuf; /**< scratch area for DUPSORT put() */ - MDB_txn *me_txn; /**< current write transaction */ - MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - pgno_t me_maxpg; /**< me_mapsize / me_psize */ - MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ - pthread_key_t me_txkey; /**< thread-key for readers */ - txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ - MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ -# define me_pglast me_pgstate.mf_pglast -# define me_pghead me_pgstate.mf_pghead - MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ - /** IDL of pages that became unused in a write txn */ - MDB_IDL me_free_pgs; - /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ - MDB_ID2L me_dirty_list; - /** Max number of freelist items that can fit in a single overflow page */ - unsigned me_maxfree_1pg; - /** Max size of a node on a page */ - unsigned me_nodemax; - unsigned me_maxkey_limit; /**< max size of a key */ - int me_live_reader; /**< have liveness lock in reader table */ - void *me_userctx; /**< User-settable context */ -#if MDB_DEBUG - MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ -#endif - uint64_t me_sync_pending; /**< Total dirty/commited bytes since the last mdb_env_sync() */ - uint64_t me_sync_threshold; /**< Treshold of above to force synchronous flush */ -#if MDBX_MODE_ENABLED - MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ -#endif -#ifdef USE_VALGRIND - int me_valgrind_handle; -#endif -}; - - /** Nested transaction */ -typedef struct MDB_ntxn { - MDB_txn mnt_txn; /**< the transaction */ - MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ -} MDB_ntxn; - - /** max number of pages to commit in one writev() call */ -#define MDB_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES -# undef MDB_COMMIT_PAGES -# define MDB_COMMIT_PAGES IOV_MAX -#endif - - /** max bytes to write in one call */ -#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) - - /** Check \b txn and \b dbi arguments to a function */ -#define TXN_DBI_EXIST(txn, dbi, validity) \ - ((dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) - - /** Check for misused \b dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) - -#define METAPAGE_1(env) \ - (&((MDB_metabuf*) (env)->me_map)->mb_metabuf.mm_meta) - -#define METAPAGE_2(env) \ - (&((MDB_metabuf*) ((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) - -static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); -static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); -static int 
mdb_page_touch(MDB_cursor *mc); -static int mdb_cursor_touch(MDB_cursor *mc); - -#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ - "reset-tmp", "fail-begin", "fail-beginchild"} -enum { - /* mdb_txn_end operation number, for logging */ - MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, - MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD -}; -#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ -#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ -#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ -#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ -static int mdb_txn_end(MDB_txn *txn, unsigned mode); - -static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); -static int mdb_page_search_root(MDB_cursor *mc, - MDB_val *key, int modify); -#define MDB_PS_MODIFY 1 -#define MDB_PS_ROOTONLY 2 -#define MDB_PS_FIRST 4 -#define MDB_PS_LAST 8 -static int mdb_page_search(MDB_cursor *mc, - MDB_val *key, int flags); -static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); - -#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ -static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, - pgno_t newpgno, unsigned nflags); - -static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); -static int mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); -static void mdb_env_close0(MDB_env *env); - -static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); -static int mdb_node_add(MDB_cursor *mc, indx_t indx, - MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags); -static void mdb_node_del(MDB_cursor *mc, int ksize); -static void mdb_node_shrink(MDB_page *mp, indx_t indx); -static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); -static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); -static size_t 
mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); -static size_t mdb_branch_size(MDB_env *env, MDB_val *key); - -static int mdb_rebalance(MDB_cursor *mc); -static int mdb_update_key(MDB_cursor *mc, MDB_val *key); - -static void mdb_cursor_pop(MDB_cursor *mc); -static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); - -static int mdb_cursor_del0(MDB_cursor *mc); -static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); -static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); -static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); -static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); -static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, - int *exactp); -static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); -static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); - -static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); -static void mdb_xcursor_init0(MDB_cursor *mc); -static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); -static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); - -static int mdb_drop0(MDB_cursor *mc, int subs); -static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); - -/** @cond */ -static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int_ai, mdb_cmp_int_a2, mdb_cmp_int_ua; -/** @endcond */ - -#ifdef __SANITIZE_THREAD__ -static pthread_mutex_t tsan_mutex = PTHREAD_MUTEX_INITIALIZER; -#endif - -/** Return the library version info. 
*/ -char * __cold -mdb_version(int *major, int *minor, int *patch) -{ - if (major) *major = MDB_VERSION_MAJOR; - if (minor) *minor = MDB_VERSION_MINOR; - if (patch) *patch = MDB_VERSION_PATCH; - return MDB_VERSION_STRING; -} - -/** Table of descriptions for LMDB @ref errors */ -static char *const mdb_errstr[] = { - "MDB_KEYEXIST: Key/data pair already exists", - "MDB_NOTFOUND: No matching key/data pair found", - "MDB_PAGE_NOTFOUND: Requested page not found", - "MDB_CORRUPTED: Located page was wrong type", - "MDB_PANIC: Update of meta page failed or environment had fatal error", - "MDB_VERSION_MISMATCH: Database environment version mismatch", - "MDB_INVALID: File is not an LMDB file", - "MDB_MAP_FULL: Environment mapsize limit reached", - "MDB_DBS_FULL: Environment maxdbs limit reached", - "MDB_READERS_FULL: Environment maxreaders limit reached", - "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", - "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", - "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", - "MDB_PAGE_FULL: Internal error - page has no more space", - "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", - "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", - "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", - "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", - "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", - "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", - "MDB_PROBLEM: Unexpected problem - txn should abort", -}; - -char * __cold -mdb_strerror(int err) -{ - int i; - if (!err) - return ("Successful return: 0"); - - if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { - i = err - MDB_KEYEXIST; - return mdb_errstr[i]; - } - - return strerror(err); -} - -#if MDBX_MODE_ENABLED -static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); -#endif /* MDBX_MODE_ENABLED */ - -static 
void mdb_debug_log(int type, const char *function, int line, const char *fmt, ...) - __attribute__((format(printf, 4, 5))); - -#if MDB_DEBUG - static txnid_t mdb_debug_edge; - - static void __cold - mdb_assert_fail(MDB_env *env, const char *msg, - const char *func, int line) - { - if (env && env->me_assert_func) - env->me_assert_func(env, msg, func, line); - else { - if (mdb_debug_logger) - mdb_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); - __assert_fail(msg, __FILE__, line, func); - } - } - -# define mdb_assert_enabled() \ - unlikely(mdb_runtime_flags & MDBX_DBG_ASSERT) - -# define mdb_audit_enabled() \ - unlikely(mdb_runtime_flags & MDBX_DBG_AUDIT) - -# define mdb_debug_enabled(type) \ - unlikely(mdb_runtime_flags & \ - (type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) - -#else -# ifndef NDEBUG -# define mdb_debug_enabled(type) (1) -# else -# define mdb_debug_enabled(type) (0) -# endif -# define mdb_audit_enabled() (0) -# define mdb_assert_enabled() (0) -# define mdb_assert_fail(env, msg, func, line) \ - __assert_fail(msg, __FILE__, line, func) -#endif /* MDB_DEBUG */ - -static void __cold -mdb_debug_log(int type, const char *function, int line, const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - if (mdb_debug_logger) - mdb_debug_logger(type, function, line, fmt, args); - else { - if (function && line > 0) - fprintf(stderr, "%s:%d ", function, line); - else if (function) - fprintf(stderr, "%s: ", function); - else if (line > 0) - fprintf(stderr, "%d: ", line); - vfprintf(stderr, fmt, args); - } - va_end(args); -} - -#define mdb_print(fmt, ...) \ - mdb_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) - -#define mdb_debug(fmt, ...) do { \ - if (mdb_debug_enabled(MDBX_DBG_TRACE)) \ - mdb_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", ##__VA_ARGS__); \ - } while(0) - -#define mdb_debug_print(fmt, ...) 
do { \ - if (mdb_debug_enabled(MDBX_DBG_TRACE)) \ - mdb_debug_log(MDBX_DBG_TRACE, NULL, 0, fmt, ##__VA_ARGS__); \ - } while(0) - -#define mdb_debug_extra(fmt, ...) do { \ - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) \ - mdb_debug_log(MDBX_DBG_EXTRA, __FUNCTION__, __LINE__, fmt, ##__VA_ARGS__); \ - } while(0) - -#define mdb_debug_extra_print(fmt, ...) do { \ - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) \ - mdb_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ - } while(0) - -#define mdb_ensure_msg(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - mdb_assert_fail(env, msg, __FUNCTION__, __LINE__); \ - } while(0) - -#define mdb_ensure(env, expr) \ - mdb_ensure_msg(env, expr, #expr) - -/** assert(3) variant in environment context */ -#define mdb_assert(env, expr) \ - do { \ - if (mdb_assert_enabled()) \ - mdb_ensure(env, expr); \ - } while(0) - -/** assert(3) variant in cursor context */ -#define mdb_cassert(mc, expr) \ - mdb_assert((mc)->mc_txn->mt_env, expr) - -/** assert(3) variant in transaction context */ -#define mdb_tassert(txn, expr) \ - mdb_assert((txn)->mt_env, expr) - -/** Return the page number of \b mp which may be sub-page, for debug output */ -static MDBX_INLINE pgno_t -mdb_dbg_pgno(MDB_page *mp) -{ - pgno_t ret; - COPY_PGNO(ret, mp->mp_pgno); - return ret; -} - -/** Display a key in hexadecimal and return the address of the result. - * @param[in] key the key to display - * @param[in] buf the buffer to write into. Should always be #DKBUF. - * @return The key in hexadecimal form. - */ -char * -mdb_dkey(MDB_val *key, char *buf) -{ - char *ptr = buf; - unsigned i; - - if (!key) - return ""; - - if (key->mv_size > DKBUF_MAXKEYSIZE) - return "MDB_MAXKEYSIZE"; - /* may want to make this a dynamic check: if the key is mostly - * printable characters, print it as-is instead of converting to hex. 
*/ -#if 1 - buf[0] = '\0'; - for (i=0; imv_size; i++) - ptr += sprintf(ptr, "%02x", ((unsigned char*) key->mv_data)[i]); -#else - sprintf(buf, "%.*s", key->mv_size, key->mv_data); -#endif - return buf; -} - -#if 0 /* LY: debug stuff */ -static const char * -mdb_leafnode_type(MDB_node *n) -{ - static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : - tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; -} - -/** Display all the keys in the page. */ -static void -mdb_page_list(MDB_page *mp) -{ - pgno_t pgno = mdb_dbg_pgno(mp); - const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; - MDB_node *node; - unsigned i, nkeys, nsize, total = 0; - MDB_val key; - DKBUF; - - switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { - case P_BRANCH: type = "Branch page"; break; - case P_LEAF: type = "Leaf page"; break; - case P_LEAF|P_SUBP: type = "Sub-page"; break; - case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; - case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; - case P_OVERFLOW: - mdb_print("Overflow page %zu pages %u%s\n", - pgno, mp->mp_pages, state); - return; - case P_META: - mdb_print("Meta-page %zu txnid %zu\n", - pgno, ((MDB_meta *)PAGEDATA(mp))->mm_txnid); - return; - default: - mdb_print("Bad page %zu flags 0x%X\n", pgno, mp->mp_flags); - return; - } - - nkeys = NUMKEYS(mp); - mdb_print("%s %zu numkeys %u%s\n", type, pgno, nkeys, state); - - for (i=0; imp_leaf2_ksize; - key.mv_data = LEAF2KEY(mp, i, nsize); - total += nsize; - mdb_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); - continue; - } - node = NODEPTR(mp, i); - key.mv_size = node->mn_ksize; - key.mv_data = node->mn_data; - nsize = NODESIZE + key.mv_size; - if (IS_BRANCH(mp)) { - mdb_print("key %u: page %zu, %s\n", i, NODEPGNO(node), DKEY(&key)); - total += nsize; - } else { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - nsize += sizeof(pgno_t); - else - nsize 
+= NODEDSZ(node); - total += nsize; - nsize += sizeof(indx_t); - mdb_print("key %u: nsize %u, %s%s\n", - i, nsize, DKEY(&key), mdb_leafnode_type(node)); - } - total = EVEN(total); - } - mdb_print("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); -} - -static void -mdb_cursor_chk(MDB_cursor *mc) -{ - unsigned i; - MDB_node *node; - MDB_page *mp; - - if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; - for (i=0; imc_top; i++) { - mp = mc->mc_pg[i]; - node = NODEPTR(mp, mc->mc_ki[i]); - if (unlikely(NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)) - mdb_print("oops!\n"); - } - if (unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) - mdb_print("ack!\n"); - if (XCURSOR_INITED(mc)) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && - mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { - mdb_print("blah!\n"); - } - } -} -#endif /* 0 */ - -/** Count all the pages in each DB and in the freelist - * and make sure it matches the actual number of pages - * being used. - * All named DBs must be open for a correct count. 
- */ -static void mdb_audit(MDB_txn *txn) -{ - MDB_cursor mc; - MDB_val key, data; - MDB_ID freecount, count; - MDB_dbi i; - int rc; - - freecount = 0; - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; - mdb_tassert(txn, rc == MDB_NOTFOUND); - - count = 0; - for (i = 0; imt_numdbs; i++) { - MDB_xcursor mx; - if (!(txn->mt_dbflags[i] & DB_VALID)) - continue; - mdb_cursor_init(&mc, txn, i, &mx); - if (txn->mt_dbs[i].md_root == P_INVALID) - continue; - count += txn->mt_dbs[i].md_branch_pages + - txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; - if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { - rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); - for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { - unsigned j; - MDB_page *mp; - mp = mc.mc_pg[mc.mc_top]; - for (j=0; jmn_flags & F_SUBDATA) { - MDB_db db; - memcpy(&db, NODEDATA(leaf), sizeof(db)); - count += db.md_branch_pages + db.md_leaf_pages + - db.md_overflow_pages; - } - } - } - mdb_tassert(txn, rc == MDB_NOTFOUND); - } - } - if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - mdb_print("audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", - txn->mt_txnid, freecount, count+NUM_METAS, - freecount+count+NUM_METAS, txn->mt_next_pgno); - } -} - -int -mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) -{ - mdb_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_cmp(a, b); -} - -int -mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) -{ - mdb_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_dcmp(a, b); -} - -/** Allocate memory for a page. - * Re-use old malloc'd pages first for singletons, otherwise just malloc. - * Set #MDB_TXN_ERROR on failure. 
- */ -static MDB_page * -mdb_page_malloc(MDB_txn *txn, unsigned num) -{ - MDB_env *env = txn->mt_env; - size_t size = env->me_psize; - MDB_page *np = env->me_dpages; - if (likely(num == 1 && np)) { - ASAN_UNPOISON_MEMORY_REGION(np, size); - VALGRIND_MEMPOOL_ALLOC(env, np, size); - VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); - env->me_dpages = np->mp_next; - } else { - size *= num; - np = malloc(size); - if (unlikely(! np)) { - txn->mt_flags |= MDB_TXN_ERROR; - return np; - } - VALGRIND_MEMPOOL_ALLOC(env, np, size); - } - - if ((env->me_flags & MDB_NOMEMINIT) == 0) { - /* For a single page alloc, we init everything after the page header. - * For multi-page, we init the final page; if the caller needed that - * many pages they will be filling in at least up to the last page. */ - size_t skip = PAGEHDRSZ; - if (num > 1) - skip += (num - 1) * env->me_psize; - memset((char *) np + skip, 0, size - skip); - } - VALGRIND_MAKE_MEM_UNDEFINED(np, size); - np->mp_flags = 0; - np->mp_pages = num; - return np; -} - -/** Free a single page. - * Saves single pages to a list, for future reuse. - * (This is not used for multi-page overflow pages.) 
- */ -static MDBX_INLINE void -mdb_page_free(MDB_env *env, MDB_page *mp) -{ - mp->mp_next = env->me_dpages; - VALGRIND_MEMPOOL_FREE(env, mp); - env->me_dpages = mp; -} - -/** Free a dirty page */ -static void -mdb_dpage_free(MDB_env *env, MDB_page *dp) -{ - if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - mdb_page_free(env, dp); - } else { - /* large pages just get freed directly */ - VALGRIND_MEMPOOL_FREE(env, dp); - free(dp); - } -} - -/** Return all dirty pages to dpage list */ -static void -mdb_dlist_free(MDB_txn *txn) -{ - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned i, n = dl[0].mid; - - for (i = 1; i <= n; i++) { - mdb_dpage_free(env, dl[i].mptr); - } - dl[0].mid = 0; -} - -static void __cold -mdb_kill_page(MDB_env *env, pgno_t pgno) -{ - const size_t offs = env->me_psize * pgno; - const size_t shift = offsetof(MDB_page, mp_pb); - - if (env->me_flags & MDB_WRITEMAP) { - MDB_page *mp = (MDB_page *)(env->me_map + offs); - memset(&mp->mp_pb, 0x6F /* 'o', 111 */, env->me_psize - shift); - VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pb, env->me_psize - shift); - ASAN_POISON_MEMORY_REGION(&mp->mp_pb, env->me_psize - shift); - } else { - struct iovec iov[1]; - iov[0].iov_len = env->me_psize - shift; - iov[0].iov_base = alloca(iov[0].iov_len); - memset(iov[0].iov_base, 0x6F /* 'o', 111 */, iov[0].iov_len); - ssize_t rc = pwritev(env->me_fd, iov, 1, offs + shift); - assert(rc == (ssize_t) iov[0].iov_len); - (void) rc; - } -} - -/** Loosen or free a single page. - * Saves single pages to a list for future reuse - * in this same txn. It has been pulled from the freeDB - * and already resides on the dirty list, but has been - * deleted. Use these pages first before pulling again - * from the freeDB. - * - * If the page wasn't dirtied in this txn, just add it - * to this txn's free list. 
- */ -static int -mdb_page_loose(MDB_cursor *mc, MDB_page *mp) -{ - int loose = 0; - pgno_t pgno = mp->mp_pgno; - MDB_txn *txn = mc->mc_txn; - - if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { - if (txn->mt_parent) { - MDB_ID2 *dl = txn->mt_u.dirty_list; - /* If txn has a parent, make sure the page is in our - * dirty list. */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - /* ok, it's ours */ - loose = 1; - } - } - } else { - /* no parent txn, so it's just ours */ - loose = 1; - } - } - if (loose) { - mdb_debug("loosen db %d page %zu", DDBI(mc), mp->mp_pgno); - MDB_page **link = &NEXT_LOOSE_PAGE(mp); - if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { - mdb_kill_page(txn->mt_env, pgno); - VALGRIND_MAKE_MEM_UNDEFINED(link, sizeof(MDB_page*)); - ASAN_UNPOISON_MEMORY_REGION(link, sizeof(MDB_page*)); - } - *link = txn->mt_loose_pgs; - txn->mt_loose_pgs = mp; - txn->mt_loose_count++; - mp->mp_flags |= P_LOOSE; - } else { - int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); - if (unlikely(rc)) - return rc; - } - - return MDB_SUCCESS; -} - -/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. - * @param[in] mc A cursor handle for the current operation. - * @param[in] pflags Flags of the pages to update: - * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. - * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) -{ - enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3, *m0 = mc; - MDB_xcursor *mx; - MDB_page *dp, *mp; - MDB_node *leaf; - unsigned i, j; - int rc = MDB_SUCCESS, level; - - /* Mark pages seen by cursors: First m0, then tracked cursors */ - for (i = txn->mt_numdbs;; ) { - if (mc->mc_flags & C_INITIALIZED) { - for (m3 = mc;; m3 = &mx->mx_cursor) { - mp = NULL; - for (j=0; jmc_snum; j++) { - mp = m3->mc_pg[j]; - if ((mp->mp_flags & Mask) == pflags) - mp->mp_flags ^= P_KEEP; - } - mx = m3->mc_xcursor; - /* Proceed to mx if it is at a sub-database */ - if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) - break; - if (! (mp && (mp->mp_flags & P_LEAF))) - break; - leaf = NODEPTR(mp, m3->mc_ki[j-1]); - if (!(leaf->mn_flags & F_SUBDATA)) - break; - } - } - mc = mc->mc_next; - for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) - if (i == 0) - goto mark_done; - } - -mark_done: - if (all) { - /* Mark dirty root pages */ - for (i=0; imt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) - continue; - if (unlikely((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) - dp->mp_flags ^= P_KEEP; - } - } - } - - return rc; -} - -static int mdb_page_flush(MDB_txn *txn, int keep); - -/** Spill pages from the dirty list back to disk. - * This is intended to prevent running into #MDB_TXN_FULL situations, - * but note that they may still occur in a few cases: - * 1) our estimate of the txn size could be too small. Currently this - * seems unlikely, except with a large number of #MDB_MULTIPLE items. - * 2) child txns may run out of space if their parents dirtied a - * lot of pages and never spilled them. 
TODO: we probably should do - * a preemptive spill during #mdb_txn_begin() of a child txn, if - * the parent's dirty_room is below a given threshold. - * - * Otherwise, if not using nested txns, it is expected that apps will - * not run into #MDB_TXN_FULL any more. The pages are flushed to disk - * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. - * If the txn never references them again, they can be left alone. - * If the txn only reads them, they can be used without any fuss. - * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of #mdb_page_touch(). Such references are - * handled by #mdb_page_unspill(). - * - * Also note, we never spill DB root pages, nor pages of active cursors, - * because we'll need these back again soon anyway. And in nested txns, - * we can't spill a page in a child txn if it was already spilled in a - * parent txn. That would alter the parent txns' data even though - * the child hasn't committed yet, and we'd have no way to undo it if - * the child aborted. - * - * @param[in] m0 cursor A cursor handle identifying the transaction and - * database for which we are checking space. - * @param[in] key For a put operation, the key being stored. - * @param[in] data For a put operation, the data being stored. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) -{ - MDB_txn *txn = m0->mc_txn; - MDB_page *dp; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned i, j, need; - int rc; - - if (m0->mc_flags & C_SUB) - return MDB_SUCCESS; - - /* Estimate how much space this op will take */ - i = m0->mc_db->md_depth; - /* Named DBs also dirty the main DB */ - if (m0->mc_dbi >= CORE_DBS) - i += txn->mt_dbs[MAIN_DBI].md_depth; - /* For puts, roughly factor in the key+data size */ - if (key) - i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; - i += i; /* double it for good measure */ - need = i; - - if (txn->mt_dirty_room > i) - return MDB_SUCCESS; - - if (!txn->mt_spill_pgs) { - txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); - if (unlikely(!txn->mt_spill_pgs)) - return ENOMEM; - } else { - /* purge deleted slots */ - MDB_IDL sl = txn->mt_spill_pgs; - unsigned num = sl[0]; - j=0; - for (i=1; i<=num; i++) { - if (!(sl[i] & 1)) - sl[++j] = sl[i]; - } - sl[0] = j; - } - - /* Preserve pages which may soon be dirtied again */ - rc = mdb_pages_xkeep(m0, P_DIRTY, 1); - if (unlikely(rc != MDB_SUCCESS)) - goto bailout; - - /* Less aggressive spill - we originally spilled the entire dirty list, - * with a few exceptions for cursor pages and DB root pages. But this - * turns out to be a lot of wasted effort because in a large txn many - * of those pages will need to be used again. So now we spill only 1/8th - * of the dirty pages. Testing revealed this to be a good tradeoff, - * better than 1/2, 1/4, or 1/10. */ - if (need < MDB_IDL_UM_MAX / 8) - need = MDB_IDL_UM_MAX / 8; - - /* Save the page IDs of all the pages we're flushing */ - /* flush from the tail forward, this saves a lot of shifting later on. */ - for (i=dl[0].mid; i && need; i--) { - MDB_ID pn = dl[i].mid << 1; - dp = dl[i].mptr; - if (dp->mp_flags & (P_LOOSE|P_KEEP)) - continue; - /* Can't spill twice, make sure it's not already in a parent's - * spill list. 
*/ - if (txn->mt_parent) { - MDB_txn *tx2; - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - if (tx2->mt_spill_pgs) { - j = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { - dp->mp_flags |= P_KEEP; - break; - } - } - } - if (tx2) - continue; - } - rc = mdb_midl_append(&txn->mt_spill_pgs, pn); - if (unlikely(rc != MDB_SUCCESS)) - goto bailout; - need--; - } - mdb_midl_sort(txn->mt_spill_pgs); - - /* Flush the spilled part of dirty list */ - rc = mdb_page_flush(txn, i); - if (unlikely(rc != MDB_SUCCESS)) - goto bailout; - - /* Reset any dirty pages we kept that page_flush didn't see */ - rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); - -bailout: - txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; - return rc; -} - -static MDBX_INLINE uint64_t -mdb_meta_sign(MDB_meta *meta) { - uint64_t sign = MDB_DATASIGN_NONE; -#if 0 /* TODO */ - sign = hippeus_hash64( - &meta->mm_mapsize, - sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize), - meta->mm_version | (uint64_t) MDB_MAGIC << 32 - ); -#else - (void) meta; -#endif - /* LY: newer returns MDB_DATASIGN_NONE or MDB_DATASIGN_WEAK */ - return (sign > MDB_DATASIGN_WEAK) ? 
sign : ~sign; -} - -static MDBX_INLINE MDB_meta* -mdb_meta_head_w(MDB_env *env) { - MDB_meta* a = METAPAGE_1(env); - MDB_meta* b = METAPAGE_2(env); - txnid_t head_txnid = env->me_txns->mti_txnid; - - mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - if (a->mm_txnid == head_txnid) - return a; - if (likely(b->mm_txnid == head_txnid)) - return b; - - mdb_debug("me_txns->mti_txnid not match meta-pages"); - mdb_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid); - env->me_flags |= MDB_FATAL_ERROR; - return a; -} - -static MDB_meta* -mdb_meta_head_r(MDB_env *env) { - MDB_meta* a = METAPAGE_1(env); - MDB_meta* b = METAPAGE_2(env), *h; - -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - - txnid_t head_txnid = env->me_txns->mti_txnid; - mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - if (likely(a->mm_txnid == head_txnid)) { - h = a; - } else if (likely(b->mm_txnid == head_txnid)) { - h = b; - } else { - /* LY: seems got a collision with mdb_env_sync0() */ - mdbx_coherent_barrier(); - head_txnid = env->me_txns->mti_txnid; - mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - - if (likely(a->mm_txnid == head_txnid)) { - h = a; - } else if (likely(b->mm_txnid == head_txnid)) { - h = b; - } else { - /* LY: got a race again, or DB is corrupted */ - int rc = mdb_mutex_lock(env, MDB_MUTEX(env, w)); - h = mdb_meta_head_w(env); - if (rc == 0) - mdb_mutex_unlock(env, MDB_MUTEX(env, w)); - } - } - -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - - return h; -} - -static MDBX_INLINE MDB_meta* -mdb_env_meta_flipflop(const MDB_env *env, MDB_meta* meta) { - return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); -} - -static MDBX_INLINE int -mdb_meta_lt(MDB_meta* a, MDB_meta* b) { - return (META_IS_STEADY(a) == META_IS_STEADY(b)) - ? a->mm_txnid < b->mm_txnid : META_IS_STEADY(b); -} - -/** Find oldest txnid still referenced. 
*/ -static -txnid_t mdb_find_oldest(MDB_env *env, int *laggard) -{ -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - int i, reader; - MDB_reader *r = env->me_txns->mti_readers; - txnid_t oldest = env->me_txns->mti_txnid; - - MDB_meta* a = METAPAGE_1(env); - MDB_meta* b = METAPAGE_2(env); - if (META_IS_WEAK(a) && oldest > b->mm_txnid) - oldest = b->mm_txnid; - if (META_IS_WEAK(b) && oldest > a->mm_txnid) - oldest = a->mm_txnid; - - for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - txnid_t snap = r[i].mr_txnid; - if (oldest > snap) { - oldest = snap; - reader = i; - } - } - } -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - - if (laggard) - *laggard = reader; - return env->me_pgoldest = oldest; -} - -/** Add a page to the txn's dirty list */ -static void -mdb_page_dirty(MDB_txn *txn, MDB_page *mp) -{ - MDB_ID2 mid; - int rc, (*insert)(MDB_ID2L, MDB_ID2 *); - - if (txn->mt_flags & MDB_TXN_WRITEMAP) { - insert = mdb_mid2l_append; - } else { - insert = mdb_mid2l_insert; - } - mid.mid = mp->mp_pgno; - mid.mptr = mp; - rc = insert(txn->mt_u.dirty_list, &mid); - mdb_tassert(txn, rc == 0); - txn->mt_dirty_room--; -} - -/** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. - * - * If there are free pages available from older transactions, they - * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the freedB, just merge freeDB records into me_pghead[] - * and move me_pglast to say which records were consumed. Only this - * function can create me_pghead and move me_pglast/mt_next_pgno. - * @param[in] mc cursor A cursor handle identifying the transaction and - * database for which we are allocating. - * @param[in] num the number of pages to allocate. - * @param[out] mp Address of the allocated page(s). 
Requests for multiple pages - * will always be satisfied by a single contiguous chunk of memory. - * @return 0 on success, non-zero on failure. - */ - -#define MDBX_ALLOC_CACHE 1 -#define MDBX_ALLOC_GC 2 -#define MDBX_ALLOC_NEW 4 -#define MDBX_ALLOC_KICK 8 -#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE|MDBX_ALLOC_GC|MDBX_ALLOC_NEW|MDBX_ALLOC_KICK) - -static int -mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) -{ - int rc; - MDB_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; - pgno_t pgno, *mop = env->me_pghead; - unsigned i = 0, j, mop_len = mop ? mop[0] : 0, n2 = num-1; - MDB_page *np; - txnid_t oldest = 0, last = 0; - MDB_cursor_op op; - MDB_cursor m2; - int found_oldest = 0; - - if (likely(flags & MDBX_ALLOC_GC)) { - flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); - if (unlikely(mc->mc_flags & C_RECLAIMING)) { - /* If mc is updating the freeDB, then the freelist cannot play - * catch-up with itself by growing while trying to save it. */ - flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); - } - } - - if (likely(flags & MDBX_ALLOC_CACHE)) { - /* If there are any loose pages, just use them */ - assert(mp && num); - if (likely(num == 1 && txn->mt_loose_pgs)) { - np = txn->mt_loose_pgs; - txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); - txn->mt_loose_count--; - mdb_debug("db %d use loose page %zu", DDBI(mc), np->mp_pgno); - ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); - *mp = np; - return MDB_SUCCESS; - } - } - - /* If our dirty list is already full, we can't do anything */ - if (unlikely(txn->mt_dirty_room == 0)) { - rc = MDB_TXN_FULL; - goto fail; - } - - for (;;) { /* oom-kick retry loop */ - for (op = MDB_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? MDB_PREV : MDB_NEXT) { - MDB_val key, data; - MDB_node *leaf; - pgno_t *idl; - - /* Seek a big enough contiguous page range. Prefer - * pages at the tail, just truncating the list. 
*/ - if (likely(flags & MDBX_ALLOC_CACHE) - && mop_len > n2 - && ( !(flags & MDBX_COALESCE) || op == MDB_FIRST)) { - i = mop_len; - do { - pgno = mop[i]; - if (likely(mop[i-n2] == pgno+n2)) - goto done; - } while (--i > n2); - } - - if (op == MDB_FIRST) { /* 1st iteration */ - /* Prepare to fetch more and coalesce */ - if (unlikely( !(flags & MDBX_ALLOC_GC) )) - break; - - oldest = env->me_pgoldest; - mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (flags & MDBX_LIFORECLAIM) { - if (! found_oldest) { - oldest = mdb_find_oldest(env, NULL); - found_oldest = 1; - } - /* Begin from oldest reader if any */ - if (oldest > 2) { - last = oldest - 1; - op = MDB_SET_RANGE; - } - } else if (env->me_pglast) { - /* Continue lookup from env->me_pglast to higher/last */ - last = env->me_pglast; - op = MDB_SET_RANGE; - } - - key.mv_data = &last; - key.mv_size = sizeof(last); - } - - if (! (flags & MDBX_LIFORECLAIM) ) { - /* Do not fetch more if the record will be too recent */ - if (op != MDB_FIRST && ++last >= oldest) { - if (!found_oldest) { - oldest = mdb_find_oldest(env, NULL); - found_oldest = 1; - } - if (oldest <= last) - break; - } - } - - rc = mdb_cursor_get(&m2, &key, NULL, op); - if (rc == MDB_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { - if (op == MDB_SET_RANGE) - continue; - found_oldest = 1; - if (oldest < mdb_find_oldest(env, NULL)) { - oldest = env->me_pgoldest; - last = oldest - 1; - key.mv_data = &last; - key.mv_size = sizeof(last); - op = MDB_SET_RANGE; - rc = mdb_cursor_get(&m2, &key, NULL, op); - } - } - if (unlikely(rc)) { - if (rc == MDB_NOTFOUND) - break; - goto fail; - } - - last = *(txnid_t*)key.mv_data; - if (oldest <= last) { - if (!found_oldest) { - oldest = mdb_find_oldest(env, NULL); - found_oldest = 1; - } - if (oldest <= last) { - if (flags & MDBX_LIFORECLAIM) - continue; - break; - } - } - - if (flags & MDBX_LIFORECLAIM) { - if (txn->mt_lifo_reclaimed) { - for(j = txn->mt_lifo_reclaimed[0]; j > 0; --j) - if (txn->mt_lifo_reclaimed[j] == last) - 
break; - if (j) - continue; - } - } - - np = m2.mc_pg[m2.mc_top]; - leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if (unlikely((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)) - goto fail; - - if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { - txn->mt_lifo_reclaimed = mdb_midl_alloc(env->me_maxfree_1pg); - if (unlikely(!txn->mt_lifo_reclaimed)) { - rc = ENOMEM; - goto fail; - } - } - - idl = (MDB_ID *) data.mv_data; - mdb_tassert(txn, idl[0] == 0 || data.mv_size == (idl[0] + 1) * sizeof(MDB_ID)); - i = idl[0]; - if (!mop) { - if (unlikely(!(env->me_pghead = mop = mdb_midl_alloc(i)))) { - rc = ENOMEM; - goto fail; - } - } else { - if (unlikely((rc = mdb_midl_need(&env->me_pghead, i)) != 0)) - goto fail; - mop = env->me_pghead; - } - if (flags & MDBX_LIFORECLAIM) { - if ((rc = mdb_midl_append(&txn->mt_lifo_reclaimed, last)) != 0) - goto fail; - } - env->me_pglast = last; - - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) { - mdb_debug_extra("IDL read txn %zu root %zu num %u, IDL", - last, txn->mt_dbs[FREE_DBI].md_root, i); - for (j = i; j; j--) - mdb_debug_extra_print(" %zu", idl[j]); - mdb_debug_extra_print("\n"); - } - - /* Merge in descending sorted order */ - mdb_midl_xmerge(mop, idl); - mop_len = mop[0]; - - if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { - /* force gc reclaim mode */ - return MDB_SUCCESS; - } - - /* Don't try to coalesce too much. 
*/ - if (mop_len > MDB_IDL_UM_SIZE / 2) - break; - if (flags & MDBX_COALESCE) { - if (mop_len /* current size */ >= env->me_maxfree_1pg / 2 - || i /* prev size */ >= env->me_maxfree_1pg / 4) - flags &= ~MDBX_COALESCE; - } - } - - if ((flags & (MDBX_COALESCE|MDBX_ALLOC_CACHE)) == (MDBX_COALESCE|MDBX_ALLOC_CACHE) - && mop_len > n2) { - i = mop_len; - do { - pgno = mop[i]; - if (mop[i-n2] == pgno+n2) - goto done; - } while (--i > n2); - } - - /* Use new pages from the map when nothing suitable in the freeDB */ - i = 0; - pgno = txn->mt_next_pgno; - rc = MDB_MAP_FULL; - if (likely(pgno + num <= env->me_maxpg)) { - rc = MDB_NOTFOUND; - if (likely(flags & MDBX_ALLOC_NEW)) - goto done; - } - - if ((flags & MDBX_ALLOC_GC) - && ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) { - MDB_meta* head = mdb_meta_head_w(env); - MDB_meta* tail = mdb_env_meta_flipflop(env, head); - - if (oldest == tail->mm_txnid - && META_IS_WEAK(head) && !META_IS_WEAK(tail)) { - MDB_meta meta = *head; - /* LY: Here an oom was happened: - * - all pages had allocated; - * - reclaiming was stopped at the last steady-sync; - * - the head-sync is weak. - * Now we need make a sync to resume reclaiming. If both - * MDB_NOSYNC and MDB_MAPASYNC flags are set, then assume that - * utterly no-sync write mode was requested. In such case - * don't make a steady-sync, but only a legacy-mode checkpoint, - * just for resume reclaiming only, not for data consistency. */ - - mdb_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu", - head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', - tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', - oldest, env->me_txns->mt1.mtb.mtb_txnid ); - - int flags = env->me_flags & MDB_WRITEMAP; - if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) - flags |= MDBX_UTTERLY_NOSYNC; - - mdb_assert(env, env->me_sync_pending > 0); - if (mdb_env_sync0(env, flags, &meta) == MDB_SUCCESS) { - txnid_t snap = mdb_find_oldest(env, NULL); - if (snap > oldest) { - continue; - } - } - } - - if (rc == MDB_MAP_FULL) { -#if MDBX_MODE_ENABLED - txnid_t snap = mdbx_oomkick(env, oldest); -#else - mdb_debug("DB size maxed out"); - txnid_t snap = mdb_find_oldest(env, NULL); -#endif /* MDBX_MODE_ENABLED */ - if (snap > oldest) { - oldest = snap; - continue; - } - } - } - -fail: - if (mp) { - *mp = NULL; - txn->mt_flags |= MDB_TXN_ERROR; - } - assert(rc); - return rc; - } - -done: - assert(mp && num); - if (env->me_flags & MDB_WRITEMAP) { - np = (MDB_page *)(env->me_map + env->me_psize * pgno); - /* LY: reset no-access flag from mdb_kill_page() */ - VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); - ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize * num); - } else { - if (unlikely(!(np = mdb_page_malloc(txn, num)))) { - rc = ENOMEM; - goto fail; - } - } - if (i) { - mop[0] = mop_len -= num; - /* Move any stragglers down */ - for (j = i-num; j < mop_len; ) - mop[++j] = mop[++i]; - } else { - txn->mt_next_pgno = pgno + num; - } - - if (env->me_flags & MDBX_PAGEPERTURB) - memset(np, 0x71 /* 'q', 113 */, env->me_psize * num); - VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); - - np->mp_pgno = pgno; - np->mp_leaf2_ksize = 0; - np->mp_flags = 0; - np->mp_pages = num; - mdb_page_dirty(txn, np); - *mp = np; - - return MDB_SUCCESS; -} - -/** Copy the used portions of a non-overflow page. 
- * @param[in] dst page to copy into - * @param[in] src page to copy from - * @param[in] psize size of a page - */ -static void -mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned psize) -{ - enum { Align = sizeof(pgno_t) }; - indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; - - /* If page isn't full, just copy the used portion. Adjust - * alignment so memcpy may copy words instead of bytes. */ - if ((unused &= -Align) && !IS_LEAF2(src)) { - upper = (upper + PAGEBASE) & -Align; - memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); - memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), - psize - upper); - } else { - memcpy(dst, src, psize - unused); - } -} - -/** Pull a page off the txn's spill list, if present. - * If a page being referenced was spilled to disk in this txn, bring - * it back and make it dirty/writable again. - * @param[in] txn the transaction handle. - * @param[in] mp the page being referenced. It must not be dirty. - * @param[out] ret the writable page, if any. ret is unchanged if - * mp wasn't spilled. - */ -static int -mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) -{ - MDB_env *env = txn->mt_env; - const MDB_txn *tx2; - unsigned x; - pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - - for (tx2 = txn; tx2; tx2=tx2->mt_parent) { - if (!tx2->mt_spill_pgs) - continue; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - MDB_page *np; - int num; - if (txn->mt_dirty_room == 0) - return MDB_TXN_FULL; - if (IS_OVERFLOW(mp)) - num = mp->mp_pages; - else - num = 1; - if (env->me_flags & MDB_WRITEMAP) { - np = mp; - } else { - np = mdb_page_malloc(txn, num); - if (unlikely(!np)) - return ENOMEM; - if (num > 1) - memcpy(np, mp, num * env->me_psize); - else - mdb_page_copy(np, mp, env->me_psize); - } - if (tx2 == txn) { - /* If in current txn, this page is no longer spilled. - * If it happens to be the last page, truncate the spill list. 
- * Otherwise mark it as deleted by setting the LSB. */ - if (x == txn->mt_spill_pgs[0]) - txn->mt_spill_pgs[0]--; - else - txn->mt_spill_pgs[x] |= 1; - } /* otherwise, if belonging to a parent txn, the - * page remains spilled until child commits - */ - - mdb_page_dirty(txn, np); - np->mp_flags |= P_DIRTY; - *ret = np; - break; - } - } - return MDB_SUCCESS; -} - -/** Touch a page: make it dirty and re-insert into tree with updated pgno. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc cursor pointing to the page to be touched - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_touch(MDB_cursor *mc) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top], *np; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m2, *m3; - pgno_t pgno; - int rc; - - if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if (txn->mt_flags & MDB_TXN_SPILLS) { - np = NULL; - rc = mdb_page_unspill(txn, mp, &np); - if (unlikely(rc)) - goto fail; - if (likely(np)) - goto done; - } - if (unlikely((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || - (rc = mdb_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) - goto fail; - pgno = np->mp_pgno; - mdb_debug("touched db %d page %zu -> %zu", DDBI(mc), mp->mp_pgno, pgno); - mdb_cassert(mc, mp->mp_pgno != pgno); - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - /* Update the parent page, if any, to point to the new page */ - if (mc->mc_top) { - MDB_page *parent = mc->mc_pg[mc->mc_top-1]; - MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); - SETPGNO(node, pgno); - } else { - mc->mc_db->md_root = pgno; - } - } else if (txn->mt_parent && !IS_SUBP(mp)) { - MDB_ID2 mid, *dl = txn->mt_u.dirty_list; - pgno = mp->mp_pgno; - /* If txn has a parent, make sure the page is in our - * dirty list. */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (unlikely(mp != dl[x].mptr)) { /* bad cursor? 
*/ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - return 0; - } - } - mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); - /* No - copy it */ - np = mdb_page_malloc(txn, 1); - if (unlikely(!np)) - return ENOMEM; - mid.mid = pgno; - mid.mptr = np; - rc = mdb_mid2l_insert(dl, &mid); - mdb_cassert(mc, rc == 0); - } else { - return 0; - } - - mdb_page_copy(np, mp, txn->mt_env->me_psize); - np->mp_pgno = pgno; - np->mp_flags |= P_DIRTY; - -done: - /* Adjust cursors pointing to mp */ - mc->mc_pg[mc->mc_top] = np; - m2 = txn->mt_cursors[mc->mc_dbi]; - if (mc->mc_flags & C_SUB) { - for (; m2; m2=m2->mc_next) { - m3 = &m2->mc_xcursor->mx_cursor; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[mc->mc_top] == mp) - m3->mc_pg[mc->mc_top] = np; - } - } else { - for (; m2; m2=m2->mc_next) { - if (m2->mc_snum < mc->mc_snum) continue; - if (m2 == mc) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - m2->mc_pg[mc->mc_top] = np; - if (XCURSOR_INITED(m2) && IS_LEAF(np)) - XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); - } - } - } - return 0; - -fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_env_sync(MDB_env *env, int force) -{ - int rc; - pthread_mutex_t *mutex; - MDB_meta *head; - unsigned flags; - - if (unlikely(! env)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(! env->me_txns)) - return MDB_PANIC; - - flags = env->me_flags & ~MDB_NOMETASYNC; - if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) - return EACCES; - - head = mdb_meta_head_r(env); - if (! 
META_IS_WEAK(head) && env->me_sync_pending == 0 - && env->me_mapsize == head->mm_mapsize) - /* LY: nothing to do */ - return MDB_SUCCESS; - - if (force || head->mm_mapsize != env->me_mapsize - || (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)) - flags &= MDB_WRITEMAP; - - /* LY: early sync before acquiring the mutex to reduce writer's latency */ - if (env->me_sync_pending > env->me_psize * 16 && (flags & MDB_NOSYNC) == 0) { - if (flags & MDB_WRITEMAP) { - size_t used_size = env->me_psize * (head->mm_last_pg + 1); - rc = msync(env->me_map, used_size, - (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC); - } else { - rc = fdatasync(env->me_fd); - } - if (unlikely(rc)) - return errno; - } - - mutex = MDB_MUTEX(env, w); - rc = mdb_mutex_lock(env, mutex); - if (unlikely(rc)) - return rc; - - /* LY: head may be changed while the mutex has been acquired. */ - head = mdb_meta_head_w(env); - rc = MDB_SUCCESS; - if (META_IS_WEAK(head) || env->me_sync_pending != 0 - || env->me_mapsize != head->mm_mapsize) { - MDB_meta meta = *head; - rc = mdb_env_sync0(env, flags, &meta); - } - - mdb_mutex_unlock(env, mutex); - return rc; -} - -/** Back up parent txn's cursors, then grab the originals for tracking */ -static int -mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) -{ - MDB_cursor *mc, *bk; - MDB_xcursor *mx; - size_t size; - int i; - - for (i = src->mt_numdbs; --i >= 0; ) { - if ((mc = src->mt_cursors[i]) != NULL) { - size = sizeof(MDB_cursor); - if (mc->mc_xcursor) - size += sizeof(MDB_xcursor); - for (; mc; mc = bk->mc_next) { - bk = malloc(size); - if (unlikely(!bk)) - return ENOMEM; - *bk = *mc; - mc->mc_backup = bk; - mc->mc_db = &dst->mt_dbs[i]; - /* Kill pointers into src to reduce abuse: The - * user may not use mc until dst ends. But we need a valid - * txn pointer here for cursor fixups to keep working. 
*/ - mc->mc_txn = dst; - mc->mc_dbflag = &dst->mt_dbflags[i]; - if ((mx = mc->mc_xcursor) != NULL) { - *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = dst; - } - mc->mc_next = dst->mt_cursors[i]; - dst->mt_cursors[i] = mc; - } - } - } - return MDB_SUCCESS; -} - -/** Close this write txn's cursors, give parent txn's cursors back to parent. - * @param[in] txn the transaction handle. - * @param[in] merge true to keep changes to parent cursors, false to revert. - * @return 0 on success, non-zero on failure. - */ -static void -mdb_cursors_eot(MDB_txn *txn, unsigned merge) -{ - MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDB_xcursor *mx; - int i; - - for (i = txn->mt_numdbs; --i >= 0; ) { - for (mc = cursors[i]; mc; mc = next) { - unsigned stage = mc->mc_signature; - mdb_ensure(NULL, stage == MDBX_MC_SIGNATURE - || stage == MDBX_MC_WAIT4EOT); - next = mc->mc_next; - if ((bk = mc->mc_backup) != NULL) { - if (merge) { - /* Commit changes to parent txn */ - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbflag = bk->mc_dbflag; - if ((mx = mc->mc_xcursor) != NULL) - mx->mx_cursor.mc_txn = bk->mc_txn; - } else { - /* Abort nested txn */ - *mc = *bk; - if ((mx = mc->mc_xcursor) != NULL) - *mx = *(MDB_xcursor *)(bk+1); - } -#if MDBX_MODE_ENABLED - bk->mc_signature = 0; - free(bk); - } - if (stage == MDBX_MC_WAIT4EOT) { - mc->mc_signature = 0; - free(mc); - } else { - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0 /* reset C_UNTRACK */; - } -#else - mc = bk; - } - /* Only malloced cursors are permanently tracked. */ - mc->mc_signature = 0; - free(mc); -#endif - } - cursors[i] = NULL; - } -} - -/** Set or check a pid lock. Set returns 0 on success. - * Check returns 0 if the process is certainly dead, nonzero if it may - * be alive (the lock exists or an error happened so we do not know). 
- */ -static int -mdb_reader_pid(MDB_env *env, int op, pid_t pid) -{ - for (;;) { - int rc; - struct flock lock_info; - memset(&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = pid; - lock_info.l_len = 1; - if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { - if (op == F_GETLK && lock_info.l_type != F_UNLCK) - rc = -1; - } else if ((rc = errno) == EINTR) { - continue; - } - return rc; - } -} - -/** Common code for #mdb_txn_begin() and #mdb_txn_renew(). - * @param[in] txn the transaction handle to initialize - * @return 0 on success, non-zero on failure. - */ -static int -mdb_txn_renew0(MDB_txn *txn, unsigned flags) -{ - MDB_env *env = txn->mt_env; - unsigned i, nr; - int rc, new_notls = 0; - - if (unlikely(env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - if (flags & MDB_TXN_RDONLY) { - MDBX_rthc *rthc = NULL; - MDB_reader *r = NULL; - - txn->mt_flags = MDB_TXN_RDONLY; - if (likely(env->me_flags & MDB_ENV_TXKEY)) { - mdb_assert(env, !(env->me_flags & MDB_NOTLS)); - rthc = mdbx_rthc_get(env->me_txkey); - if (unlikely(! 
rthc)) - return ENOMEM; - if (likely(rthc->rc_reader)) { - r = rthc->rc_reader; - mdb_assert(env, r->mr_pid == env->me_pid); - mdb_assert(env, r->mr_tid == pthread_self()); - } - } else { - mdb_assert(env, env->me_flags & MDB_NOTLS); - r = txn->mt_u.reader; - } - - if (likely(r)) { - if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) - return MDB_BAD_RSLOT; - } else { - pid_t pid = env->me_pid; - pthread_t tid = pthread_self(); - pthread_mutex_t *rmutex = MDB_MUTEX(env, r); - - rc = mdb_mutex_lock(env, rmutex); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - - if (unlikely(!env->me_live_reader)) { - rc = mdb_reader_pid(env, F_SETLK, pid); - if (unlikely(rc != MDB_SUCCESS)) { - mdb_mutex_unlock(env, rmutex); - return rc; - } - env->me_live_reader = 1; - } - - nr = env->me_txns->mti_numreaders; - for (i=0; ime_txns->mti_readers[i].mr_pid == 0) - break; - if (unlikely(i == env->me_maxreaders)) { - mdb_mutex_unlock(env, rmutex); - return MDB_READERS_FULL; - } - r = &env->me_txns->mti_readers[i]; - /* Claim the reader slot, carefully since other code - * uses the reader table un-mutexed: First reset the - * slot, next publish it in mti_numreaders. After - * that, it is safe for mdb_env_close() to touch it. - * When it will be closed, we can finally claim it. 
*/ - r->mr_pid = 0; - r->mr_txnid = ~(txnid_t)0; - r->mr_tid = tid; - mdbx_coherent_barrier(); -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - if (i == nr) - env->me_txns->mti_numreaders = ++nr; - if (env->me_close_readers < nr) - env->me_close_readers = nr; - r->mr_pid = pid; -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - mdb_mutex_unlock(env, rmutex); - - new_notls = MDB_END_SLOT; - if (likely(rthc)) { - rthc->rc_reader = r; - new_notls = 0; - } - } - - while((env->me_flags & MDB_FATAL_ERROR) == 0) { - MDB_meta *meta = mdb_meta_head_r(txn->mt_env); - txnid_t lead = meta->mm_txnid; - r->mr_txnid = lead; - mdbx_coherent_barrier(); - - txnid_t snap = txn->mt_env->me_txns->mti_txnid; - /* LY: Retry on a race, ITS#7970. */ - if (likely(lead == snap)) { - txn->mt_txnid = lead; - txn->mt_next_pgno = meta->mm_last_pg+1; - /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); -#if MDBX_MODE_ENABLED - txn->mt_canary = meta->mm_canary; -#endif - break; - } - } - - txn->mt_u.reader = r; - txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - } else { - /* Not yet touching txn == env->me_txn0, it may be active */ - rc = mdb_mutex_lock(env, MDB_MUTEX(env, w)); - if (unlikely(rc)) - return rc; - -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - MDB_meta *meta = mdb_meta_head_w(env); -#if MDBX_MODE_ENABLED - txn->mt_canary = meta->mm_canary; -#endif - txn->mt_txnid = meta->mm_txnid + 1; - txn->mt_flags = flags; -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - -#if MDB_DEBUG - if (unlikely(txn->mt_txnid == mdb_debug_edge)) { - if (! 
mdb_debug_logger) - mdb_runtime_flags |= MDBX_DBG_TRACE | MDBX_DBG_EXTRA - | MDBX_DBG_AUDIT | MDBX_DBG_ASSERT; - mdb_debug_log(MDBX_DBG_EDGE, __FUNCTION__, __LINE__, - "on/off edge (txn %zu)", txn->mt_txnid); - } -#endif - txn->mt_child = NULL; - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - txn->mt_dirty_room = MDB_IDL_UM_MAX; - txn->mt_u.dirty_list = env->me_dirty_list; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_free_pgs = env->me_free_pgs; - txn->mt_free_pgs[0] = 0; - txn->mt_spill_pgs = NULL; - if (txn->mt_lifo_reclaimed) - txn->mt_lifo_reclaimed[0] = 0; - env->me_txn = txn; - memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned)); - /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); - /* Moved to here to avoid a data race in read TXNs */ - txn->mt_next_pgno = meta->mm_last_pg+1; - } - - /* Setup db info */ - txn->mt_numdbs = env->me_numdbs; - for (i=CORE_DBS; imt_numdbs; i++) { - unsigned x = env->me_dbflags[i]; - txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; - txn->mt_dbflags[i] = (x & MDB_VALID) ? 
DB_VALID|DB_USRVALID|DB_STALE : 0; - } - txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; - txn->mt_dbflags[FREE_DBI] = DB_VALID; - - if (unlikely(env->me_flags & MDB_FATAL_ERROR)) { - mdb_debug("environment had fatal error, must shutdown!"); - rc = MDB_PANIC; - } else if (unlikely(env->me_maxpg < txn->mt_next_pgno)) { - rc = MDB_MAP_RESIZED; - } else { - return MDB_SUCCESS; - } - mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); - return rc; -} - -int -mdb_txn_renew(MDB_txn *txn) -{ - int rc; - - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED))) - return EINVAL; - - rc = mdb_txn_renew0(txn, MDB_TXN_RDONLY); - if (rc == MDB_SUCCESS) { - mdb_debug("renew txn %zu%c %p on mdbenv %p, root page %zu", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); - } - return rc; -} - -int -mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **ret) -{ - MDB_txn *txn; - MDB_ntxn *ntxn; - int rc, size, tsize; - - if (unlikely(!env || !ret)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - flags &= MDB_TXN_BEGIN_FLAGS; - flags |= env->me_flags & MDB_WRITEMAP; - - if (unlikely(env->me_flags & MDB_RDONLY & ~flags)) /* write txn in RDONLY env */ - return EACCES; - - if (parent) { - if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) - return EINVAL; - - /* Nested transactions: Max 1 child, write txns only, no writemap */ - flags |= parent->mt_flags; - if (unlikely(flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED))) { - return (parent->mt_flags & MDB_TXN_RDONLY) ? 
EINVAL : MDB_BAD_TXN; - } - /* Child txns save MDB_pgstate and use own copy of cursors */ - size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); - size += tsize = sizeof(MDB_ntxn); - } else if (flags & MDB_RDONLY) { - size = env->me_maxdbs * (sizeof(MDB_db)+1); - size += tsize = sizeof(MDB_txn); - } else { - /* Reuse preallocated write txn. However, do not touch it until - * mdb_txn_renew0() succeeds, since it currently may be active. */ - txn = env->me_txn0; - goto renew; - } - if (unlikely((txn = calloc(1, size)) == NULL)) { - mdb_debug("calloc: %s", strerror(errno)); - return ENOMEM; - } - txn->mt_dbxs = env->me_dbxs; /* static */ - txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); - txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; - txn->mt_flags = flags; - txn->mt_env = env; - - if (parent) { - unsigned i; - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = parent->mt_dbiseqs; - txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); - if (!txn->mt_u.dirty_list || - !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) - { - free(txn->mt_u.dirty_list); - free(txn); - return ENOMEM; - } - txn->mt_txnid = parent->mt_txnid; - txn->mt_dirty_room = parent->mt_dirty_room; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_spill_pgs = NULL; - txn->mt_next_pgno = parent->mt_next_pgno; - parent->mt_flags |= MDB_TXN_HAS_CHILD; - parent->mt_child = txn; - txn->mt_parent = parent; - txn->mt_numdbs = parent->mt_numdbs; - memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - /* Copy parent's mt_dbflags, but clear DB_NEW */ - for (i=0; imt_numdbs; i++) - txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; - rc = 0; - ntxn = (MDB_ntxn *)txn; - ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ - if (env->me_pghead) { - size = MDB_IDL_SIZEOF(env->me_pghead); - env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); - if (likely(env->me_pghead)) - memcpy(env->me_pghead, 
ntxn->mnt_pgstate.mf_pghead, size); - else - rc = ENOMEM; - } - if (likely(!rc)) - rc = mdb_cursor_shadow(parent, txn); - if (unlikely(rc)) - mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); - } else { /* MDB_RDONLY */ - txn->mt_dbiseqs = env->me_dbiseqs; -renew: - rc = mdb_txn_renew0(txn, flags); - } - if (unlikely(rc)) { - if (txn != env->me_txn0) - free(txn); - } else { - txn->mt_signature = MDBX_MT_SIGNATURE; - *ret = txn; - mdb_debug("begin txn %zu%c %p on mdbenv %p, root page %zu", - txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', - (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root); - } - - return rc; -} - -MDB_env * -mdb_txn_env(MDB_txn *txn) -{ - if(unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return NULL; - return txn->mt_env; -} - -size_t -mdb_txn_id(MDB_txn *txn) -{ - if(unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return 0; - return txn->mt_txnid; -} - -/** Export or close DBI handles opened in this txn. */ -static void -mdb_dbis_update(MDB_txn *txn, int keep) -{ - int i; - MDB_dbi n = txn->mt_numdbs; - MDB_env *env = txn->mt_env; - unsigned char *tdbflags = txn->mt_dbflags; - - for (i = n; --i >= CORE_DBS;) { - if (tdbflags[i] & DB_NEW) { - if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; - } else { - char *ptr = env->me_dbxs[i].md_name.mv_data; - if (ptr) { - env->me_dbxs[i].md_name.mv_data = NULL; - env->me_dbxs[i].md_name.mv_size = 0; - env->me_dbflags[i] = 0; - env->me_dbiseqs[i]++; - free(ptr); - } - } - } - } - if (keep && env->me_numdbs < n) - env->me_numdbs = n; -} - -/** End a transaction, except successful commit of a nested transaction. - * May be called twice for readonly txns: First reset it, then abort. 
- * @param[in] txn the transaction handle to end - * @param[in] mode why and how to end the transaction - */ -static int -mdb_txn_end(MDB_txn *txn, unsigned mode) -{ - MDB_env *env = txn->mt_env; - static const char *const names[] = MDB_END_NAMES; - - if (unlikely(txn->mt_env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - /* Export or close DBI handles opened in this txn */ - mdb_dbis_update(txn, mode & MDB_END_UPDATE); - - mdb_debug("%s txn %zu%c %p on mdbenv %p, root page %zu", - names[mode & MDB_END_OPMASK], - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); - - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (txn->mt_u.reader) { -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - txn->mt_u.reader->mr_txnid = ~(txnid_t)0; - if (!(env->me_flags & MDB_NOTLS)) { - txn->mt_u.reader = NULL; /* txn does not own reader */ - } else if (mode & MDB_END_SLOT) { - txn->mt_u.reader->mr_pid = 0; - txn->mt_u.reader = NULL; - } /* else txn owns the slot until it does MDB_END_SLOT */ -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - } - mdbx_coherent_barrier(); - txn->mt_numdbs = 0; /* prevent further DBI activity */ - txn->mt_flags |= MDB_TXN_FINISHED; - - } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { - pgno_t *pghead = env->me_pghead; - - if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ - mdb_cursors_eot(txn, 0); - if (!(env->me_flags & MDB_WRITEMAP)) { - mdb_dlist_free(txn); - } - - if (txn->mt_lifo_reclaimed) { - txn->mt_lifo_reclaimed[0] = 0; - if (txn != env->me_txn0) { - mdb_midl_free(txn->mt_lifo_reclaimed); - txn->mt_lifo_reclaimed = NULL; - } - } - txn->mt_numdbs = 0; - txn->mt_flags = MDB_TXN_FINISHED; - - if (!txn->mt_parent) { - mdb_midl_shrink(&txn->mt_free_pgs); - env->me_free_pgs = txn->mt_free_pgs; - /* me_pgstate: */ - env->me_pghead = NULL; - env->me_pglast = 0; - - env->me_txn 
= NULL; - mode = 0; /* txn == env->me_txn0, do not free() it */ - - /* The writer mutex was locked in mdb_txn_begin. */ - mdb_mutex_unlock(env, MDB_MUTEX(env, w)); - } else { - txn->mt_parent->mt_child = NULL; - txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; - env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; - mdb_midl_free(txn->mt_free_pgs); - mdb_midl_free(txn->mt_spill_pgs); - free(txn->mt_u.dirty_list); - } - - mdb_midl_free(pghead); - } - - if (mode & MDB_END_FREE) { - txn->mt_signature = 0; - free(txn); - } - - return MDB_SUCCESS; -} - -int -mdb_txn_reset(MDB_txn *txn) -{ - if (unlikely(! txn)) - return EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - /* This call is only valid for read-only txns */ - if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) - return EINVAL; - -#if MDBX_MODE_ENABLED - /* LY: don't close DBI-handles in MDBX mode */ - return mdb_txn_end(txn, MDB_END_RESET|MDB_END_UPDATE); -#else - return mdb_txn_end(txn, MDB_END_RESET); -#endif /* MDBX_MODE_ENABLED */ -} - -int -mdb_txn_abort(MDB_txn *txn) -{ - if (unlikely(! txn)) - return EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - -#if MDBX_MODE_ENABLED - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - /* LY: don't close DBI-handles in MDBX mode */ - return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE); -#endif /* MDBX_MODE_ENABLED */ - - if (txn->mt_child) - mdb_txn_abort(txn->mt_child); - - return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); -} - -static MDBX_INLINE int -mdb_backlog_size(MDB_txn *txn) -{ - int reclaimed = txn->mt_env->me_pghead ? txn->mt_env->me_pghead[0] : 0; - return reclaimed + txn->mt_loose_count; -} - -/* LY: Prepare a backlog of pages to modify FreeDB itself, - * while reclaiming is prohibited. It should be enough to prevent search - * in mdb_page_alloc() during a deleting, when freeDB tree is unbalanced. 
*/ -static int -mdb_prep_backlog(MDB_txn *txn, MDB_cursor *mc) -{ - /* LY: extra page(s) for b-tree rebalancing */ - const int extra = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) ? 2 : 1; - - if (mdb_backlog_size(txn) < mc->mc_db->md_depth + extra) { - int rc = mdb_cursor_touch(mc); - if (unlikely(rc)) - return rc; - - while (unlikely(mdb_backlog_size(txn) < extra)) { - rc = mdb_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); - if (unlikely(rc)) { - if (unlikely(rc != MDB_NOTFOUND)) - return rc; - break; - } - } - } - - return MDB_SUCCESS; -} - -/** Save the freelist as of this transaction to the freeDB. - * This changes the freelist. Keep trying until it stabilizes. - */ -static int -mdb_freelist_save(MDB_txn *txn) -{ - /* env->me_pghead[] can grow and shrink during this call. - * env->me_pglast and txn->mt_free_pgs[] can only grow. - * Page numbers cannot disappear from txn->mt_free_pgs[]. */ - MDB_cursor mc; - MDB_env *env = txn->mt_env; - int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; - txnid_t pglast = 0, head_id = 0; - pgno_t freecnt = 0, *free_pgs, *mop; - ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; - unsigned cleanup_idx = 0, refill_idx = 0; - const int lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; - - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - - /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ - clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) - ? SSIZE_MAX : maxfree_1pg; - -again: - for (;;) { - /* Come back here after each Put() in case freelist changed */ - MDB_val key, data; - pgno_t *pgs; - ssize_t j; - - if (! lifo) { - /* If using records from freeDB which we have not yet - * deleted, delete them and any we reserved for me_pghead. 
*/ - while (pglast < env->me_pglast) { - rc = mdb_cursor_first(&mc, &key, NULL); - if (unlikely(rc)) - goto bailout; - rc = mdb_prep_backlog(txn, &mc); - if (unlikely(rc)) - goto bailout; - pglast = head_id = *(txnid_t *)key.mv_data; - total_room = head_room = 0; - more = 1; - mdb_tassert(txn, pglast <= env->me_pglast); - mc.mc_flags |= C_RECLAIMING; - rc = mdb_cursor_del(&mc, 0); - mc.mc_flags &= ~C_RECLAIMING; - if (unlikely(rc)) - goto bailout; - } - } else if (txn->mt_lifo_reclaimed) { - /* LY: cleanup reclaimed records. */ - while(cleanup_idx < txn->mt_lifo_reclaimed[0]) { - pglast = txn->mt_lifo_reclaimed[++cleanup_idx]; - key.mv_data = &pglast; - key.mv_size = sizeof(pglast); - rc = mdb_cursor_get(&mc, &key, NULL, MDB_SET); - if (likely(rc != MDB_NOTFOUND)) { - if (unlikely(rc)) - goto bailout; - rc = mdb_prep_backlog(txn, &mc); - if (unlikely(rc)) - goto bailout; - mc.mc_flags |= C_RECLAIMING; - rc = mdb_cursor_del(&mc, 0); - mc.mc_flags &= ~C_RECLAIMING; - if (unlikely(rc)) - goto bailout; - } - } - } - - if (unlikely(!env->me_pghead) && txn->mt_loose_pgs) { - /* Put loose page numbers in mt_free_pgs, since - * we may be unable to return them to me_pghead. 
*/ - MDB_page *mp = txn->mt_loose_pgs; - if (unlikely((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)) - return rc; - for (; mp; mp = NEXT_LOOSE_PAGE(mp)) - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - } - - /* Save the IDL of pages freed by this txn, to a single record */ - if (freecnt < txn->mt_free_pgs[0]) { - if (unlikely(!freecnt)) { - /* Make sure last page of freeDB is touched and on freelist */ - rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); - if (unlikely(rc && rc != MDB_NOTFOUND)) - goto bailout; - } - free_pgs = txn->mt_free_pgs; - /* Write to last page of freeDB */ - key.mv_size = sizeof(txn->mt_txnid); - key.mv_data = &txn->mt_txnid; - do { - freecnt = free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (unlikely(rc)) - goto bailout; - /* Retry if mt_free_pgs[] grew during the Put() */ - free_pgs = txn->mt_free_pgs; - } while (freecnt < free_pgs[0]); - - mdb_midl_sort(free_pgs); - memcpy(data.mv_data, free_pgs, data.mv_size); - - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) { - unsigned i = free_pgs[0]; - mdb_debug_extra("IDL write txn %zu root %zu num %u, IDL", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); - for (; i; i--) - mdb_debug_extra_print(" %zu", free_pgs[i]); - mdb_debug_extra_print("\n"); - } - continue; - } - - mop = env->me_pghead; - mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; - - if (mop_len && refill_idx == 0) - refill_idx = 1; - - /* Reserve records for me_pghead[]. Split it if multi-page, - * to avoid searching freeDB for a page range. Use keys in - * range [1,me_pglast]: Smaller than txnid of oldest reader. 
*/ - if (total_room >= mop_len) { - if (total_room == mop_len || --more < 0) - break; - } else if (head_room >= maxfree_1pg && head_id > 1) { - /* Keep current record (overflow page), add a new one */ - head_id--; - refill_idx++; - head_room = 0; - } - - if (lifo) { - if (refill_idx > (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) { - /* LY: need just a txn-id for save page list. */ - rc = mdb_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); - if (likely(rc == 0)) - /* LY: ok, reclaimed from freedb. */ - continue; - if (unlikely(rc != MDB_NOTFOUND)) - /* LY: other troubles... */ - goto bailout; - - /* LY: freedb is empty, will look any free txn-id in high2low order. */ - if (unlikely(env->me_pglast < 1)) { - /* LY: not any txn in the past of freedb. */ - rc = MDB_MAP_FULL; - goto bailout; - } - - if (unlikely(! txn->mt_lifo_reclaimed)) { - txn->mt_lifo_reclaimed = mdb_midl_alloc(env->me_maxfree_1pg); - if (unlikely(! txn->mt_lifo_reclaimed)) { - rc = ENOMEM; - goto bailout; - } - } - /* LY: append the list. */ - rc = mdb_midl_append(&txn->mt_lifo_reclaimed, env->me_pglast - 1); - if (unlikely(rc)) - goto bailout; - --env->me_pglast; - /* LY: note that freeDB cleanup is not needed. 
*/ - ++cleanup_idx; - } - head_id = txn->mt_lifo_reclaimed[refill_idx]; - } - - /* (Re)write {key = head_id, IDL length = head_room} */ - total_room -= head_room; - head_room = mop_len - total_room; - if (head_room > maxfree_1pg && head_id > 1) { - /* Overflow multi-page for part of me_pghead */ - head_room /= head_id; /* amortize page sizes */ - head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); - } else if (head_room < 0) { - /* Rare case, not bothering to delete this record */ - head_room = 0; - continue; - } - key.mv_size = sizeof(head_id); - key.mv_data = &head_id; - data.mv_size = (head_room + 1) * sizeof(pgno_t); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (unlikely(rc)) - goto bailout; - /* IDL is initially empty, zero out at least the length */ - pgs = (pgno_t *)data.mv_data; - j = head_room > clean_limit ? head_room : 0; - do { - pgs[j] = 0; - } while (--j >= 0); - total_room += head_room; - } - - mdb_tassert(txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - - /* Return loose page numbers to me_pghead, though usually none are - * left at this point. The pages themselves remain in dirty_list. */ - if (txn->mt_loose_pgs) { - MDB_page *mp = txn->mt_loose_pgs; - unsigned count = txn->mt_loose_count; - MDB_IDL loose; - /* Room for loose pages + temp IDL with same */ - if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) - goto bailout; - mop = env->me_pghead; - loose = mop + MDB_IDL_ALLOCLEN(mop) - count; - for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) - loose[ ++count ] = mp->mp_pgno; - loose[0] = count; - mdb_midl_sort(loose); - mdb_midl_xmerge(mop, loose); - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - mop_len = mop[0]; - } - - /* Fill in the reserved me_pghead records */ - rc = MDB_SUCCESS; - if (mop_len) { - MDB_val key, data; - - mop += mop_len; - if (! 
lifo) { - rc = mdb_cursor_first(&mc, &key, &data); - if (unlikely(rc)) - goto bailout; - } - - for(;;) { - txnid_t id; - ssize_t len; - MDB_ID save; - - if (! lifo) { - id = *(txnid_t *)key.mv_data; - mdb_tassert(txn, id <= env->me_pglast); - } else { - mdb_tassert(txn, refill_idx > 0 && refill_idx <= txn->mt_lifo_reclaimed[0]); - id = txn->mt_lifo_reclaimed[refill_idx--]; - key.mv_data = &id; - key.mv_size = sizeof(id); - rc = mdb_cursor_get(&mc, &key, &data, MDB_SET); - if (unlikely(rc)) - goto bailout; - } - mdb_tassert(txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - - len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; - mdb_tassert(txn, len >= 0); - if (len > mop_len) - len = mop_len; - data.mv_size = (len + 1) * sizeof(MDB_ID); - key.mv_data = &id; - key.mv_size = sizeof(id); - data.mv_data = mop -= len; - - save = mop[0]; - mop[0] = len; - rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); - mdb_tassert(txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - mop[0] = save; - if (unlikely(rc || (mop_len -= len) == 0)) - goto bailout; - - if (! lifo) { - rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT); - if (unlikely(rc)) - goto bailout; - } - } - } - -bailout: - if (txn->mt_lifo_reclaimed) { - mdb_tassert(txn, rc || cleanup_idx == txn->mt_lifo_reclaimed[0]); - if (rc == 0 && cleanup_idx != txn->mt_lifo_reclaimed[0]) { - mdb_tassert(txn, cleanup_idx < txn->mt_lifo_reclaimed[0]); - /* LY: zeroed cleanup_idx to force cleanup & refill created freeDB records. */ - cleanup_idx = 0; - /* LY: restart filling */ - refill_idx = total_room = head_room = 0; - more = 1; - goto again; - } - txn->mt_lifo_reclaimed[0] = 0; - if (txn != env->me_txn0) { - mdb_midl_free(txn->mt_lifo_reclaimed); - txn->mt_lifo_reclaimed = NULL; - } - } - - return rc; -} - -/** Flush (some) dirty pages to the map, after clearing their dirty flag. 
- * @param[in] txn the transaction that's being committed - * @param[in] keep number of initial pages in dirty_list to keep dirty. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_flush(MDB_txn *txn, int keep) -{ - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned psize = env->me_psize, j; - int i, pagecount = dl[0].mid, rc; - size_t size = 0, pos = 0; - pgno_t pgno = 0; - MDB_page *dp = NULL; - struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; - size_t next_pos = 1; /* impossible pos, so pos != next_pos */ - int n = 0; - - j = i = keep; - - if (env->me_flags & MDB_WRITEMAP) { - /* Clear dirty flags */ - while (++i <= pagecount) { - dp = dl[i].mptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE|P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[++j] = dl[i]; - continue; - } - dp->mp_flags &= ~P_DIRTY; - env->me_sync_pending += IS_OVERFLOW(dp) ? psize * dp->mp_pages : psize; - } - goto done; - } - - /* Write the pages */ - for (;;) { - if (++i <= pagecount) { - dp = dl[i].mptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE|P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[i].mid = 0; - continue; - } - pgno = dl[i].mid; - /* clear dirty flag */ - dp->mp_flags &= ~P_DIRTY; - pos = pgno * psize; - size = psize; - if (IS_OVERFLOW(dp)) size *= dp->mp_pages; - env->me_sync_pending += size; - } - /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ - if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { - if (n) { -retry: - /* Write previous page(s) */ - wres = pwritev(env->me_fd, iov, n, wpos); - if (unlikely(wres != wsize)) { - if (wres < 0) { - rc = errno; - if (rc == EINTR) - goto retry; - mdb_debug("Write error: %s", strerror(rc)); - } else { - rc = EIO; /* TODO: Use which error code? 
*/ - mdb_debug("short write, filesystem full?"); - } - return rc; - } - n = 0; - } - if (i > pagecount) - break; - wpos = pos; - wsize = 0; - } - mdb_debug("committing page %zu", pgno); - next_pos = pos + size; - iov[n].iov_len = size; - iov[n].iov_base = (char *)dp; - wsize += size; - n++; - } - - mdb_invalidate_cache(env->me_map, txn->mt_next_pgno * env->me_psize); - - for (i = keep; ++i <= pagecount; ) { - dp = dl[i].mptr; - /* This is a page we skipped above */ - if (!dl[i].mid) { - dl[++j] = dl[i]; - dl[j].mid = dp->mp_pgno; - continue; - } - mdb_dpage_free(env, dp); - } - -done: - i--; - txn->mt_dirty_room += i - j; - dl[0].mid = j; - return MDB_SUCCESS; -} - -int -mdb_txn_commit(MDB_txn *txn) -{ - int rc; - unsigned i, end_mode; - MDB_env *env; - - if (unlikely(txn == NULL)) - return EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(txn->mt_env->me_pid != getpid())) { - txn->mt_env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - /* mdb_txn_end() mode for a commit which writes nothing */ - end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; - - if (txn->mt_child) { - rc = mdb_txn_commit(txn->mt_child); - txn->mt_child = NULL; - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - } - - env = txn->mt_env; - - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) { - goto done; - } - - if (unlikely(txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR))) { - mdb_debug("error flag is set, can't commit"); - if (txn->mt_parent) - txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - rc = MDB_BAD_TXN; - goto fail; - } - - if (txn->mt_parent) { - MDB_txn *parent = txn->mt_parent; - MDB_page **lp; - MDB_ID2L dst, src; - MDB_IDL pspill; - unsigned x, y, len, ps_len; - - /* Append our reclaim list to parent's */ - if (txn->mt_lifo_reclaimed) { - if (parent->mt_lifo_reclaimed) { - rc = mdb_midl_append_list(&parent->mt_lifo_reclaimed, txn->mt_lifo_reclaimed); - if (unlikely(rc != MDB_SUCCESS)) - 
goto fail; - mdb_midl_free(txn->mt_lifo_reclaimed); - } else - parent->mt_lifo_reclaimed = txn->mt_lifo_reclaimed; - txn->mt_lifo_reclaimed = NULL; - } - - /* Append our free list to parent's */ - rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - mdb_midl_free(txn->mt_free_pgs); - /* Failures after this must either undo the changes - * to the parent or set MDB_TXN_ERROR in the parent. */ - - parent->mt_next_pgno = txn->mt_next_pgno; - parent->mt_flags = txn->mt_flags; - - /* Merge our cursors into parent's and close them */ - mdb_cursors_eot(txn, 1); - - /* Update parent's DB table. */ - memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - parent->mt_numdbs = txn->mt_numdbs; - parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; - parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; - for (i=CORE_DBS; imt_numdbs; i++) { - /* preserve parent's DB_NEW status */ - x = parent->mt_dbflags[i] & DB_NEW; - parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; - } - - dst = parent->mt_u.dirty_list; - src = txn->mt_u.dirty_list; - /* Remove anything in our dirty list from parent's spill list */ - if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { - x = y = ps_len; - pspill[0] = (pgno_t)-1; - /* Mark our dirty pages as deleted in parent spill list */ - for (i=0, len=src[0].mid; ++i <= len; ) { - MDB_ID pn = src[i].mid << 1; - while (pn > pspill[x]) - x--; - if (pn == pspill[x]) { - pspill[x] = 1; - y = --x; - } - } - /* Squash deleted pagenums if we deleted any */ - for (x=y; ++x <= ps_len; ) - if (!(pspill[x] & 1)) - pspill[++y] = pspill[x]; - pspill[0] = y; - } - - /* Remove anything in our spill list from parent's dirty list */ - if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { - for (i=1; i<=txn->mt_spill_pgs[0]; i++) { - MDB_ID pn = txn->mt_spill_pgs[i]; - if (pn & 1) - continue; /* deleted spillpg */ - pn >>= 1; - y = mdb_mid2l_search(dst, pn); - if (y <= dst[0].mid && 
dst[y].mid == pn) { - free(dst[y].mptr); - while (y < dst[0].mid) { - dst[y] = dst[y+1]; - y++; - } - dst[0].mid--; - } - } - } - - /* Find len = length of merging our dirty list with parent's */ - x = dst[0].mid; - dst[0].mid = 0; /* simplify loops */ - if (parent->mt_parent) { - len = x + src[0].mid; - y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; - for (i = x; y && i; y--) { - pgno_t yp = src[y].mid; - while (yp < dst[i].mid) - i--; - if (yp == dst[i].mid) { - i--; - len--; - } - } - } else { /* Simplify the above for single-ancestor case */ - len = MDB_IDL_UM_MAX - txn->mt_dirty_room; - } - /* Merge our dirty list with parent's */ - y = src[0].mid; - for (i = len; y; dst[i--] = src[y--]) { - pgno_t yp = src[y].mid; - while (yp < dst[x].mid) - dst[i--] = dst[x--]; - if (yp == dst[x].mid) - free(dst[x--].mptr); - } - mdb_tassert(txn, i == x); - dst[0].mid = len; - free(txn->mt_u.dirty_list); - parent->mt_dirty_room = txn->mt_dirty_room; - if (txn->mt_spill_pgs) { - if (parent->mt_spill_pgs) { - /* TODO: Prevent failure here, so parent does not fail */ - rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); - if (unlikely(rc != MDB_SUCCESS)) - parent->mt_flags |= MDB_TXN_ERROR; - mdb_midl_free(txn->mt_spill_pgs); - mdb_midl_sort(parent->mt_spill_pgs); - } else { - parent->mt_spill_pgs = txn->mt_spill_pgs; - } - } - - /* Append our loose page list to parent's */ - for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) - ; - *lp = txn->mt_loose_pgs; - parent->mt_loose_count += txn->mt_loose_count; - - parent->mt_child = NULL; - mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); - txn->mt_signature = 0; - free(txn); - return rc; - } - - env = txn->mt_env; - if (unlikely(txn != env->me_txn)) { - mdb_debug("attempt to commit unknown transaction"); - rc = EINVAL; - goto fail; - } - - mdb_cursors_eot(txn, 0); - - if (!txn->mt_u.dirty_list[0].mid && - !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) - goto done; - - mdb_debug("committing 
txn %zu %p on mdbenv %p, root page %zu", - txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root); - - /* Update DB root pointers */ - if (txn->mt_numdbs > CORE_DBS) { - MDB_cursor mc; - MDB_dbi i; - MDB_val data; - data.mv_size = sizeof(MDB_db); - - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - for (i = CORE_DBS; i < txn->mt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - if (unlikely(TXN_DBI_CHANGED(txn, i))) { - rc = MDB_BAD_DBI; - goto fail; - } - data.mv_data = &txn->mt_dbs[i]; - rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, - F_SUBDATA); - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - } - } - } - - rc = mdb_freelist_save(txn); - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - - mdb_midl_free(env->me_pghead); - env->me_pghead = NULL; - mdb_midl_shrink(&txn->mt_free_pgs); - - if (mdb_audit_enabled()) - mdb_audit(txn); - - rc = mdb_page_flush(txn, 0); - if (likely(rc == MDB_SUCCESS)) { - MDB_meta meta; - - meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - meta.mm_last_pg = txn->mt_next_pgno - 1; - meta.mm_txnid = txn->mt_txnid; -#if MDBX_MODE_ENABLED - meta.mm_canary = txn->mt_canary; -#endif - - rc = mdb_env_sync0(env, env->me_flags | txn->mt_flags, &meta); - } - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; - -done: - return mdb_txn_end(txn, end_mode); - -fail: - mdb_txn_abort(txn); - return rc; -} - -/** Read the environment parameters of a DB environment before - * mapping it into memory. - * @param[in] env the environment handle - * @param[out] meta address of where to store the meta information - * @return 0 on success, non-zero on failure. - */ -static int __cold -mdb_env_read_header(MDB_env *env, MDB_meta *meta) -{ - MDB_metabuf pbuf; - MDB_page *p; - MDB_meta *m; - int i, rc, off; - enum { Size = sizeof(pbuf) }; - - /* We don't know the page size yet, so use a minimum value. 
- * Read both meta pages so we can use the latest one. - */ - - meta->mm_datasync_sign = MDB_DATASIGN_WEAK; - meta->mm_txnid = 0; - for (i=off=0; imm_psize) { - rc = pread(env->me_fd, &pbuf, Size, off); - if (rc != Size) { - if (rc == 0 && off == 0) - return ENOENT; - rc = rc < 0 ? (int) errno : MDB_INVALID; - mdb_debug("read: %s", mdb_strerror(rc)); - return rc; - } - - p = (MDB_page *)&pbuf; - - if (!F_ISSET(p->mp_flags, P_META)) { - mdb_debug("page %zu not a meta page", p->mp_pgno); - return MDB_INVALID; - } - - m = PAGEDATA(p); - if (m->mm_magic != MDB_MAGIC) { - mdb_debug("meta has invalid magic"); - return MDB_INVALID; - } - - if (m->mm_version != MDB_DATA_VERSION) { - mdb_debug("database is version %u, expected version %u", - m->mm_version, MDB_DATA_VERSION); - return MDB_VERSION_MISMATCH; - } - - if (m->mm_datasync_sign > MDB_DATASIGN_WEAK && m->mm_datasync_sign != mdb_meta_sign(m)) - continue; - - if (mdb_meta_lt(meta, m)) - *meta = *m; - } - - if (meta->mm_datasync_sign == MDB_DATASIGN_WEAK) - /* LY: Both meta-pages are weak. */ - return MDB_CORRUPTED; - - return MDB_SUCCESS; -} - -/** Fill in most of the zeroed #MDB_meta for an empty database environment */ -static void __cold -mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) -{ - meta->mm_magic = MDB_MAGIC; - meta->mm_version = MDB_DATA_VERSION; - meta->mm_mapsize = env->me_mapsize; - meta->mm_psize = env->me_psize; - meta->mm_last_pg = NUM_METAS-1; - meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ - meta->mm_dbs[FREE_DBI].md_root = P_INVALID; - meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; - meta->mm_datasync_sign = mdb_meta_sign(meta); -} - -/** Write the environment parameters of a freshly created DB environment. - * @param[in] env the environment handle - * @param[in] meta the #MDB_meta to write - * @return 0 on success, non-zero on failure. 
- */ -static int __cold -mdb_env_init_meta(MDB_env *env, MDB_meta *meta) -{ - MDB_page *p, *q; - int rc; - unsigned psize; - int len; - - mdb_debug("writing new meta page"); - - psize = env->me_psize; - - p = calloc(NUM_METAS, psize); - if (!p) - return ENOMEM; - p->mp_pgno = 0; - p->mp_flags = P_META; - *(MDB_meta *)PAGEDATA(p) = *meta; - - q = (MDB_page *)((char *)p + psize); - q->mp_pgno = 1; - q->mp_flags = P_META; - *(MDB_meta *)PAGEDATA(q) = *meta; - - do - len = pwrite(env->me_fd, p, psize * NUM_METAS, 0); - while (len == -1 && errno == EINTR); - - if (len < 0) - rc = errno; - else if ((unsigned) len == psize * NUM_METAS) - rc = MDB_SUCCESS; - else - rc = ENOSPC; - free(p); - return rc; -} - -static int -mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) -{ - int rc; - MDB_meta* head = mdb_meta_head_w(env); - size_t prev_mapsize = head->mm_mapsize; - size_t used_size = env->me_psize * (pending->mm_last_pg + 1); - - mdb_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); - mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); - mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 - || env->me_mapsize != prev_mapsize); - - pending->mm_mapsize = env->me_mapsize; - mdb_assert(env, pending->mm_mapsize >= used_size); - if (unlikely(pending->mm_mapsize != prev_mapsize)) { - if (pending->mm_mapsize < prev_mapsize) { - /* LY: currently this can't happen, but force full-sync. */ - flags &= MDB_WRITEMAP; - } else { - /* Persist any increases of mapsize config */ - } - } - - if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) - flags &= MDB_WRITEMAP; - - /* LY: step#1 - sync previously written/updated data-pages */ - if (env->me_sync_pending && (flags & MDB_NOSYNC) == 0) { - if (env->me_flags & MDB_WRITEMAP) { - int mode = (flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; - if (unlikely(msync(env->me_map, used_size, mode))) { - rc = errno; - /* LY: msync() should never return EINTR */ - goto fail; - } - if ((flags & MDB_MAPASYNC) == 0) - env->me_sync_pending = 0; - } else { - int (*flush)(int fd) = fdatasync; - if (unlikely(prev_mapsize != pending->mm_mapsize)) { - /* LY: It is no reason to use fdatasync() here, even in case - * no such bug in a kernel. Because "no-bug" mean that a kernel - * internally do nearly the same, e.g. fdatasync() == fsync() - * when no-kernel-bug and file size was changed. - * - * So, this code is always safe and without appreciable - * performance degradation. - * - * For more info about of a corresponding fdatasync() bug - * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - flush = fsync; - } - while(unlikely(flush(env->me_fd) < 0)) { - rc = errno; - if (rc != EINTR) - goto fail; - } - env->me_sync_pending = 0; - } - } - - /* LY: step#2 - update meta-page. */ - if (env->me_sync_pending == 0) { - pending->mm_datasync_sign = mdb_meta_sign(pending); - } else { - pending->mm_datasync_sign = - (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC - ? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK; - } - - volatile MDB_meta* target = (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) - ? head : mdb_env_meta_flipflop(env, head); - off_t offset = (char*) target - env->me_map; - - MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target); - mdb_debug("writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, txn_id %zu, %s", - offset >= env->me_psize, - target == head ? "head" : "tail", target->mm_txnid, - META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" : "Legacy", - stay == head ? "head" : "tail", stay->mm_txnid, - META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy", - pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid, - META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? 
"Steady" : "Legacy" ); - - if (env->me_flags & MDB_WRITEMAP) { -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - /* LY: 'invalidate' the meta, - * but mdb_meta_head_r() will be confused/retired in collision case. */ - target->mm_datasync_sign = MDB_DATASIGN_WEAK; - target->mm_txnid = 0; - /* LY: update info */ - target->mm_mapsize = pending->mm_mapsize; - target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; - target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; - target->mm_last_pg = pending->mm_last_pg; -#if MDBX_MODE_ENABLED - target->mm_canary = pending->mm_canary; -#endif - /* LY: 'commit' the meta */ - target->mm_txnid = pending->mm_txnid; - target->mm_datasync_sign = pending->mm_datasync_sign; - } else { - pending->mm_magic = MDB_MAGIC; - pending->mm_version = MDB_DATA_VERSION; - pending->mm_address = head->mm_address; - retry: - rc = pwrite(env->me_fd, pending, sizeof(MDB_meta), offset); - if (unlikely(rc != sizeof(MDB_meta))) { - rc = (rc < 0) ? errno : EIO; - if (rc == EINTR) - goto retry; - - undo: - mdb_debug("write failed, disk error?"); - /* On a failure, the pagecache still contains the new data. - * Write some old data back, to prevent it from being used. */ - if (pwrite(env->me_fd, (void*) target, sizeof(MDB_meta), offset) == sizeof(MDB_meta)) { - /* LY: take a chance, if write succeeds at a magic ;) */ - goto retry; - } - goto fail; - } - mdb_invalidate_cache(env->me_map + offset, sizeof(MDB_meta)); -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - } - - /* Memory ordering issues are irrelevant; since the entire writer - * is wrapped by wmutex, all of these changes will become visible - * after the wmutex is unlocked. Since the DB is multi-version, - * readers will get consistent data regardless of how fresh or - * how stale their view of these values is. 
- */ - env->me_txns->mti_txnid = pending->mm_txnid; -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - - /* LY: step#3 - sync meta-pages. */ - if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { - if (env->me_flags & MDB_WRITEMAP) { - char* ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); - int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - if (unlikely(msync(ptr, env->me_os_psize, mode) < 0)) { - rc = errno; - goto fail; - } - } else { - while(unlikely(fdatasync(env->me_fd) < 0)) { - rc = errno; - if (rc != EINTR) - goto undo; - } - } - } - - /* LY: currently this can't happen, but... */ - if (unlikely(pending->mm_mapsize < prev_mapsize)) { - mdb_assert(env, pending->mm_mapsize == env->me_mapsize); - if (unlikely(mremap(env->me_map, prev_mapsize, pending->mm_mapsize, - MREMAP_FIXED, pending->mm_address) == MAP_FAILED)) { - rc = errno; - goto fail; - } - if (unlikely(ftruncate(env->me_fd, pending->mm_mapsize) < 0)) { - rc = errno; - goto fail; - } - } - - return MDB_SUCCESS; - -fail: - env->me_flags |= MDB_FATAL_ERROR; - return rc; -} - -int __cold -mdb_env_create(MDB_env **env) -{ - MDB_env *e; - - e = calloc(1, sizeof(MDB_env)); - if (!e) - return ENOMEM; - - e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = CORE_DBS; - e->me_fd = INVALID_HANDLE_VALUE; - e->me_lfd = INVALID_HANDLE_VALUE; - e->me_pid = getpid(); - GET_PAGESIZE(e->me_os_psize); - VALGRIND_CREATE_MEMPOOL(e,0,0); - e->me_signature = MDBX_ME_SIGNATURE; - *env = e; - return MDB_SUCCESS; -} - -static int __cold -mdb_env_map(MDB_env *env, void *addr, size_t usedsize) -{ - unsigned flags = env->me_flags; - - int prot = PROT_READ; - if (flags & MDB_WRITEMAP) { - prot |= PROT_WRITE; - if (ftruncate(env->me_fd, env->me_mapsize) < 0) - return errno; - } - - env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, env->me_fd, 0); - if (env->me_map == MAP_FAILED) { - env->me_map = NULL; - return errno; - } - - /* Can happen because the address argument 
to mmap() is just a - * hint. mmap() can pick another, e.g. if the range is in use. - * The MAP_FIXED flag would prevent that, but then mmap could - * instead unmap existing pages to make room for the new map. - */ - if (addr && env->me_map != addr) { - errno = 0; /* LY: clean errno as a hit for this case */ - return EBUSY; /* TODO: Make a new MDB_* error code? */ - } - - if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) - return errno; - -#ifdef MADV_NOHUGEPAGE - (void) madvise(env->me_map, env->me_mapsize, MADV_NOHUGEPAGE); -#endif - -#ifdef MADV_DONTDUMP - if (! (flags & MDBX_PAGEPERTURB)) { - (void) madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); - } -#endif - -#ifdef MADV_REMOVE - if (flags & MDB_WRITEMAP) { - (void) madvise(env->me_map + usedsize, env->me_mapsize - usedsize, MADV_REMOVE); - } -#endif - - /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ - if (madvise(env->me_map, env->me_mapsize, (flags & MDB_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) - return errno; - - /* Lock meta pages to avoid unexpected write, - * before the data pages would be synchronized. */ - if ((flags & MDB_WRITEMAP) && mlock(env->me_map, env->me_psize * 2)) - return errno; - -#ifdef USE_VALGRIND - env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "lmdb"); -#endif - - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_mapsize(MDB_env *env, size_t size) -{ - if (unlikely(!env)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(size < env->me_psize * 8)) - return EINVAL; - - /* If env is already open, caller is responsible for making - * sure there are no active txns. 
- */ - if (env->me_map) { - int rc; - MDB_meta *meta; - void *old; - if (env->me_txn) - return EINVAL; - meta = mdb_meta_head_w(env); - if (!size) - size = meta->mm_mapsize; - /* Silently round up to minimum if the size is too small */ - const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; - if (size < usedsize) - size = usedsize; - munmap(env->me_map, env->me_mapsize); -#ifdef USE_VALGRIND - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = -1; -#endif - env->me_mapsize = size; - old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; - rc = mdb_env_map(env, old, usedsize); - if (rc) - return rc; - } - env->me_mapsize = size; - if (env->me_psize) - env->me_maxpg = env->me_mapsize / env->me_psize; - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) -{ - if (unlikely(!env)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(env->me_map)) - return EINVAL; - - env->me_maxdbs = dbs + CORE_DBS; - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_maxreaders(MDB_env *env, unsigned readers) -{ - if (unlikely(!env || readers < 1)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(env->me_map)) - return EINVAL; - - env->me_maxreaders = readers; - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_maxreaders(MDB_env *env, unsigned *readers) -{ - if (!env || !readers) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - *readers = env->me_maxreaders; - return MDB_SUCCESS; -} - -static int __cold -mdb_fsize(HANDLE fd, size_t *size) -{ - struct stat st; - - if (fstat(fd, &st)) - return errno; - - *size = st.st_size; - return MDB_SUCCESS; -} - -/** Further setup required for opening an LMDB environment - */ -static int __cold -mdb_env_open2(MDB_env *env, MDB_meta *meta) -{ - unsigned flags = env->me_flags; - int i, 
newenv = 0, rc; - - if ((i = mdb_env_read_header(env, meta)) != 0) { - if (i != ENOENT) - return i; - mdb_debug("new mdbenv"); - newenv = 1; - env->me_psize = env->me_os_psize; - if (env->me_psize > MAX_PAGESIZE) - env->me_psize = MAX_PAGESIZE; - memset(meta, 0, sizeof(*meta)); - mdb_env_init_meta0(env, meta); - meta->mm_mapsize = DEFAULT_MAPSIZE; - } else { - env->me_psize = meta->mm_psize; - } - - /* Was a mapsize configured? */ - if (!env->me_mapsize) { - env->me_mapsize = meta->mm_mapsize; - } - { - /* Make sure mapsize >= committed data size. Even when using - * mm_mapsize, which could be broken in old files (ITS#7789). - */ - size_t minsize = (meta->mm_last_pg + 1) * meta->mm_psize; - if (env->me_mapsize < minsize) - env->me_mapsize = minsize; - } - meta->mm_mapsize = env->me_mapsize; - - if (newenv && !(flags & MDB_FIXEDMAP)) { - /* mdb_env_map() may grow the datafile. Write the metapages - * first, so the file will be valid if initialization fails. - * Except with FIXEDMAP, since we do not yet know mm_address. - * We could fill in mm_address later, but then a different - * program might end up doing that - one with a memory layout - * and map address which does not suit the main program. - */ - rc = mdb_env_init_meta(env, meta); - if (rc) - return rc; - newenv = 0; - } - - const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; - rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? 
meta->mm_address : NULL, usedsize); - if (rc) - return rc; - - if (newenv) { - if (flags & MDB_FIXEDMAP) - meta->mm_address = env->me_map; - i = mdb_env_init_meta(env, meta); - if (i != MDB_SUCCESS) { - return i; - } - } - - env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - - sizeof(indx_t); - env->me_maxkey_limit = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); - env->me_maxpg = env->me_mapsize / env->me_psize; - - if (MDB_MAXKEYSIZE > env->me_maxkey_limit) - return MDB_BAD_VALSIZE; - - return MDB_SUCCESS; -} - -/****************************************************************************/ - -#ifndef MDBX_USE_THREAD_ATEXIT -# if __GLIBC_PREREQ(2,18) -# define MDBX_USE_THREAD_ATEXIT 1 -# else -# define MDBX_USE_THREAD_ATEXIT 0 -# endif -#endif - -static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; -static MDBX_rthc *mdbx_rthc_list; -static pthread_key_t mdbx_pthread_crutch_key; - -static __inline -void mdbx_rthc_lock(void) { - mdb_ensure(NULL, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); -} - -static __inline -void mdbx_rthc_unlock(void) { - mdb_ensure(NULL, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); -} - -/** Release a reader thread's slot in the reader lock table. - * This function is called automatically when a thread exits. - * @param[in] ptr This points to the MDB_rthc of a slot in the reader lock table. - */ -static __cold -void mdbx_rthc_dtor(void) -{ - /* LY: Основная задача этого деструктора была и есть в освобождении - * слота таблицы читателей при завершении треда, но тут есть пара - * не очевидных сложностей: - * - Таблица читателей располагается в разделяемой памяти, поэтому - * во избежание segfault деструктор не должен что-либо делать после - * или одновременно с mdb_env_close(). - * - Действительно, mdb_env_close() вызовет pthread_key_delete() и - * после этого glibc не будет вызывать деструктор. 
- * - ОДНАКО, это никак не решает проблему гонок между mdb_env_close() - * и завершающимися тредами. Грубо говоря, при старте mdb_env_close() - * деструктор уже может выполняться в некоторых тредах, и завершиться - * эти выполнения могут во время или после окончания mdb_env_close(). - * - БОЛЕЕ ТОГО, схожая проблема возникает при выгрузке dso/dll, - * так как в текущей glibc (2.24) подсистема ld.so ничего не знает о - * TSD-деструкторах и поэтому может выгрузить lib.so до того как - * отработали все деструкторы. - * - Исходное проявление проблемы было зафиксировано - * в https://github.com/ReOpen/ReOpenLDAP/issues/48 - * - * Предыдущее решение посредством выделяемого динамически MDB_rthc - * было не удачным, так как порождало либо утечку памяти, - * либо вероятностное обращение к уже освобожденной памяти - * из этого деструктора. - * - * Текущее решение достаточно "развесисто", но решает все описанные выше - * проблемы без пенальти по производительности. - */ - - mdbx_rthc_lock(); - - pid_t pid = getpid(); - pthread_t thread = pthread_self(); - for (MDBX_rthc** ref = &mdbx_rthc_list; *ref; ) { - MDBX_rthc* rthc = *ref; - if (rthc->rc_thread == thread) { - if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - *ref = rthc->rc_next; - free(rthc); - } else { - ref = &(*ref)->rc_next; - } - } - - mdbx_rthc_unlock(); -} - -#if MDBX_USE_THREAD_ATEXIT - -extern void *__dso_handle __attribute__ ((__weak__)); -extern int __cxa_thread_atexit_impl(void (*dtor)(void*), void *obj, void *dso_symbol); - -static __cold -void mdbx_rthc__thread_atexit(void *ptr) { - mdb_ensure(NULL, ptr == pthread_getspecific(mdbx_pthread_crutch_key)); - mdb_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, NULL) == 0); - mdbx_rthc_dtor(); -} - -static __attribute__((constructor)) __cold -void mdbx_pthread_crutch_ctor(void) { - mdb_ensure(NULL, pthread_key_create( - &mdbx_pthread_crutch_key, NULL) == 0); -} - -#else /* 
MDBX_USE_THREAD_ATEXIT */ - -static __cold -void mdbx_rthc__thread_key_dtor(void *ptr) { - (void) ptr; - if (mdbx_pthread_crutch_key != (pthread_key_t) -1) - mdbx_rthc_dtor(); -} - -static __attribute__((constructor)) __cold -void mdbx_pthread_crutch_ctor(void) { - mdb_ensure(NULL, pthread_key_create( - &mdbx_pthread_crutch_key, mdbx_rthc__thread_key_dtor) == 0); -} - -static __attribute__((destructor)) __cold -void mdbx_pthread_crutch_dtor(void) -{ - pthread_key_delete(mdbx_pthread_crutch_key); - mdbx_pthread_crutch_key = -1; - - /* LY: Из-за race condition в pthread_key_delete() - * деструкторы уже могли начать выполняться. - * Уступая квант времени сразу после удаления ключа - * мы даем им шанс завершиться. */ - pthread_yield(); - - mdbx_rthc_lock(); - pid_t pid = getpid(); - while (mdbx_rthc_list != NULL) { - MDBX_rthc* rthc = mdbx_rthc_list; - mdbx_rthc_list = mdbx_rthc_list->rc_next; - if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - free(rthc); - - /* LY: Каждый неудаленный элемент списка - это один - * не отработавший деструктор и потенциальный - * шанс получить segfault после выгрузки lib.so - * Поэтому на каждой итерации уступаем квант времени, - * в надежде что деструкторы успеют отработать. */ - mdbx_rthc_unlock(); - pthread_yield(); - mdbx_rthc_lock(); - } - mdbx_rthc_unlock(); - pthread_yield(); -} -#endif /* MDBX_USE_THREAD_ATEXIT */ - -static __cold -MDBX_rthc* mdbx_rthc_add(pthread_key_t key) -{ - MDBX_rthc *rthc = malloc(sizeof(MDBX_rthc)); - if (unlikely(rthc == NULL)) - goto bailout; - - rthc->rc_next = NULL; - rthc->rc_reader = NULL; - rthc->rc_thread = pthread_self(); - if (unlikely(pthread_setspecific(key, rthc) != 0)) - goto bailout_free; - - mdbx_rthc_lock(); - if (pthread_getspecific(mdbx_pthread_crutch_key) == NULL) { -#if MDBX_USE_THREAD_ATEXIT - void *dso_anchor = (&__dso_handle && __dso_handle) - ? 
__dso_handle : (void *)mdb_version; - if (unlikely(__cxa_thread_atexit_impl(mdbx_rthc__thread_atexit, rthc, dso_anchor) != 0)) { - mdbx_rthc_unlock(); - goto bailout_free; - } -#endif /* MDBX_USE_THREAD_ATEXIT */ - mdb_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, rthc) == 0); - } - rthc->rc_next = mdbx_rthc_list; - mdbx_rthc_list = rthc; - mdbx_rthc_unlock(); - return rthc; - -bailout_free: - free(rthc); -bailout: - return NULL; -} - -static __inline -MDBX_rthc* mdbx_rthc_get(pthread_key_t key) -{ - MDBX_rthc *rthc = pthread_getspecific(key); - if (likely(rthc != NULL)) - return rthc; - return mdbx_rthc_add(key); -} - -static __cold -void mdbx_rthc_cleanup(MDB_env *env) -{ - mdbx_rthc_lock(); - - MDB_reader *begin = env->me_txns->mti_readers; - MDB_reader *end = begin + env->me_close_readers; - for (MDBX_rthc** ref = &mdbx_rthc_list; *ref; ) { - MDBX_rthc* rthc = *ref; - if (rthc->rc_reader >= begin && rthc->rc_reader < end) { - if (rthc->rc_reader->mr_pid == env->me_pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - *ref = rthc->rc_next; - free(rthc); - } else { - ref = &(*ref)->rc_next; - } - } - - mdbx_rthc_unlock(); -} - -/****************************************************************************/ - -/** Downgrade the exclusive lock on the region back to shared */ -static __cold -int mdb_env_share_locks(MDB_env *env, int *excl) -{ - struct flock lock_info; - int rc = 0; - - /* The shared lock replaces the existing lock */ - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_RDLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = errno) == EINTR) ; - *excl = rc ? -1 : 0; /* error may mean we lost the lock */ - - return rc; -} - -/** Try to get exclusive lock, otherwise shared. - * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
- */ -static int __cold -mdb_env_excl_lock(MDB_env *env, int *excl) -{ - int rc = 0; - struct flock lock_info; - - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = errno) == EINTR) ; - if (!rc) { - *excl = 1; - } else { - lock_info.l_type = F_RDLCK; - while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && - (rc = errno) == EINTR) ; - if (rc == 0) - *excl = 0; - } - return rc; -} - -#ifdef MDB_USE_HASH -/* - * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code - * - * @(#) $Revision: 5.1 $ - * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ - * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html - * - *** - * - * Please do not copyright this code. This code is in the public domain. - * - * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO - * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - * - * By: - * chongo /\oo/\ - * http://www.isthe.com/chongo/ - * - * Share and Enjoy! :-) - */ - -typedef unsigned long long mdb_hash_t; -#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) - -/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer - * @param[in] val value to hash - * @param[in] hval initial value for hash - * @return 64 bit hash - * - * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the - * hval arg on the first call. 
- */ -static mdb_hash_t -mdb_hash_val(MDB_val *val, mdb_hash_t hval) -{ - unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ - unsigned char *end = s + val->mv_size; - /* - * FNV-1a hash each octet of the string - */ - while (s < end) { - /* xor the bottom with the current octet */ - hval ^= (mdb_hash_t)*s++; - - /* multiply by the 64 bit FNV magic prime mod 2^64 */ - hval += (hval << 1) + (hval << 4) + (hval << 5) + - (hval << 7) + (hval << 8) + (hval << 40); - } - /* return our new hash value */ - return hval; -} - -/** Hash the string and output the encoded hash. - * This uses modified RFC1924 Ascii85 encoding to accommodate systems with - * very short name limits. We don't care about the encoding being reversible, - * we just want to preserve as many bits of the input as possible in a - * small printable string. - * @param[in] str string to hash - * @param[out] encbuf an array of 11 chars to hold the hash - */ -static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; - -static void __cold -mdb_pack85(unsigned long l, char *out) -{ - int i; - - for (i=0; i<5; i++) { - *out++ = mdb_a85[l % 85]; - l /= 85; - } -} - -static void __cold -mdb_hash_enc(MDB_val *val, char *encbuf) -{ - mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); - - mdb_pack85(h, encbuf); - mdb_pack85(h>>32, encbuf+5); - encbuf[10] = '\0'; -} -#endif - -/** Open and/or initialize the lock region for the environment. - * @param[in] env The LMDB environment. - * @param[in] lpath The pathname of the file used for the lock region. - * @param[in] mode The Unix permissions for the file, if we create it. - * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive - * @return 0 on success, non-zero on failure. 
- */ -static int __cold -mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) -{ - int fdflags; - int rc; - off_t size, rsize; - void *m; - - env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode); - if (env->me_lfd == INVALID_HANDLE_VALUE) { - rc = errno; - if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { - return MDB_SUCCESS; - } - return rc; - } - - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_lfd, F_SETFD, fdflags); - - if (!(env->me_flags & MDB_NOTLS)) { - rc = pthread_key_create(&env->me_txkey, NULL); - if (rc) - return rc; - env->me_flags |= MDB_ENV_TXKEY; - } - - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. - */ - if ((rc = mdb_env_excl_lock(env, excl))) return rc; - - size = lseek(env->me_lfd, 0, SEEK_END); - if (size == -1) return errno; - rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); - if (size < rsize && *excl > 0) { - if (ftruncate(env->me_lfd, rsize) != 0) return errno; - } else { - rsize = size; - size = rsize - sizeof(MDB_txninfo); - env->me_maxreaders = size/sizeof(MDB_reader) + 1; - } - - m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0); - if (m == MAP_FAILED) - return errno; - env->me_txns = m; - -#ifdef MADV_NOHUGEPAGE - (void) madvise(env->me_txns, rsize, MADV_NOHUGEPAGE); -#endif - -#ifdef MADV_DODUMP - (void) madvise(env->me_txns, rsize, MADV_DODUMP); -#endif - - if (madvise(env->me_txns, rsize, MADV_DONTFORK) < 0) - return errno; - - if (madvise(env->me_txns, rsize, MADV_WILLNEED) < 0) - return errno; - - if (madvise(env->me_txns, rsize, MADV_RANDOM) < 0) - return errno; - - if (*excl > 0) { - /* Solaris needs this before initing a robust mutex. Otherwise - * it may skip the init and return EBUSY "seems someone already - * inited" or EINVAL "it was inited differently". 
- */ - memset(&env->me_txns->mti_rmutex, 0, sizeof(env->me_txns->mti_rmutex)); - memset(&env->me_txns->mti_wmutex, 0, sizeof(env->me_txns->mti_wmutex)); - - pthread_mutexattr_t mattr; - rc = pthread_mutexattr_init(&mattr); - if (rc) return rc; - - rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); - -#if MDB_USE_ROBUST - if(! rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); -#endif /* MDB_USE_ROBUST */ - if (! rc) rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr); - if (! rc) rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); - - pthread_mutexattr_destroy(&mattr); - if (rc) return rc; - - env->me_txns->mti_magic = MDB_MAGIC; - env->me_txns->mti_format = MDB_LOCK_FORMAT; - env->me_txns->mti_txnid = ~0L; - env->me_txns->mti_numreaders = 0; - } else { - if (env->me_txns->mti_magic != MDB_MAGIC) { - mdb_debug("lock region has invalid magic"); - return MDB_INVALID; - } - if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { - mdb_debug("lock region has format+version 0x%x, expected 0x%x", - env->me_txns->mti_format, MDB_LOCK_FORMAT); - return MDB_VERSION_MISMATCH; - } - } - - return MDB_SUCCESS; -} - - /** The name of the lock file in the DB environment */ -#define LOCKNAME "/lock.mdb" - /** The name of the data file in the DB environment */ -#define DATANAME "/data.mdb" - /** The suffix of the lock file when no subdir is used */ -#define LOCKSUFF "-lock" - /** Only a subset of the @ref mdb_env flags can be changed - * at runtime. Changing other flags requires closing the - * environment and re-opening it with the new flags. 
- */ -#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC| \ - MDB_NOMEMINIT|MDBX_COALESCE|MDBX_PAGEPERTURB) -#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ - MDB_WRITEMAP|MDB_NOTLS|MDB_NORDAHEAD|MDBX_LIFORECLAIM) - -#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) -# error "Persistent DB flags & env flags overlap, but both go in mm_flags" -#endif - -MDBX_ONLY_FEATURE int __cold -mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive) -{ - int oflags, rc, len, excl = -1; - char *lpath, *dpath; - - if (unlikely(!env || !path)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (env->me_fd != INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) - return EINVAL; - - len = strlen(path); - if (flags & MDB_NOSUBDIR) { - rc = len + sizeof(LOCKSUFF) + len + 1; - } else { - rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); - } - lpath = malloc(rc); - if (!lpath) - return ENOMEM; - if (flags & MDB_NOSUBDIR) { - dpath = lpath + len + sizeof(LOCKSUFF); - sprintf(lpath, "%s" LOCKSUFF, path); - strcpy(dpath, path); - } else { - dpath = lpath + len + sizeof(LOCKNAME); - sprintf(lpath, "%s" LOCKNAME, path); - sprintf(dpath, "%s" DATANAME, path); - } - - rc = MDB_SUCCESS; - flags |= env->me_flags; - if (flags & MDB_RDONLY) { - /* LY: silently ignore irrelevant flags when we're only getting read access */ - flags &= ~(MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOSYNC | MDB_NOMETASYNC - | MDBX_COALESCE | MDBX_LIFORECLAIM | MDB_NOMEMINIT); - } else { - if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) - && (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) - rc = ENOMEM; - } - env->me_flags = flags |= MDB_ENV_ACTIVE; - if (rc) - goto leave; - - env->me_path = strdup(path); - env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); - env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); - env->me_dbiseqs = 
calloc(env->me_maxdbs, sizeof(unsigned)); - if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { - rc = ENOMEM; - goto leave; - } - env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_int_ai; /* aligned MDB_INTEGERKEY */ - - /* For RDONLY, get lockfile after we know datafile exists */ - if (!(flags & MDB_RDONLY)) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto leave; - } - - if (F_ISSET(flags, MDB_RDONLY)) - oflags = O_RDONLY; - else - oflags = O_RDWR | O_CREAT; - - env->me_fd = open(dpath, oflags|O_CLOEXEC, mode); - if (env->me_fd == INVALID_HANDLE_VALUE) { - rc = errno; - goto leave; - } - - int fdflags; - if ((fdflags = fcntl(env->me_fd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_fd, F_SETFD, fdflags); - - if (flags & MDB_RDONLY) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto leave; - } - - MDB_meta meta; - if ((rc = mdb_env_open2(env, &meta)) == MDB_SUCCESS) { - mdb_debug("opened dbenv %p", (void *) env); - if (excl > 0) { - env->me_txns->mti_txnid = meta.mm_txnid; - if (exclusive == NULL || *exclusive < 2) { - /* LY: downgrade lock only if exclusive access not requested. - * in case exclusive==1, just leave value as is. */ - rc = mdb_env_share_locks(env, &excl); - if (rc) - goto leave; - } - } else if (exclusive) { - /* LY: just indicate that is not an exclusive access. 
*/ - *exclusive = 0; - } - if (!(flags & MDB_RDONLY)) { - MDB_txn *txn; - int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * - (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned)+1); - if ((env->me_pbuf = calloc(1, env->me_psize)) && - (txn = calloc(1, size))) - { - txn->mt_dbs = (MDB_db *)((char *)txn + tsize); - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); - txn->mt_env = env; - txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = MDB_TXN_FINISHED; - env->me_txn0 = txn; - } else { - rc = ENOMEM; - } - } - } - -#if MDB_DEBUG - if (rc == MDB_SUCCESS) { - MDB_meta *meta = mdb_meta_head_r(env); - MDB_db *db = &meta->mm_dbs[MAIN_DBI]; - int toggle = ((char*) meta == PAGEDATA(env->me_map)) ? 0 : 1; - - mdb_debug("opened database version %u, pagesize %u", - meta->mm_version, env->me_psize); - mdb_debug("using meta page %d, txn %zu", toggle, meta->mm_txnid); - mdb_debug("depth: %u", db->md_depth); - mdb_debug("entries: %zu", db->md_entries); - mdb_debug("branch pages: %zu", db->md_branch_pages); - mdb_debug("leaf pages: %zu", db->md_leaf_pages); - mdb_debug("overflow pages: %zu", db->md_overflow_pages); - mdb_debug("root: %zu", db->md_root); - } -#endif - -leave: - if (rc) - mdb_env_close0(env); - free(lpath); - return rc; -} - -int __cold -mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode) -{ - return mdbx_env_open_ex(env, path, flags, mode, NULL); -} - -/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ -static void __cold -mdb_env_close0(MDB_env *env) -{ - int i; - - if (!(env->me_flags & MDB_ENV_ACTIVE)) - return; - env->me_flags &= ~MDB_ENV_ACTIVE; - - /* Doing this here since me_dbxs may not exist during mdb_env_close */ - if (env->me_dbxs) { - for (i = env->me_maxdbs; --i >= CORE_DBS; ) - free(env->me_dbxs[i].md_name.mv_data); - free(env->me_dbxs); - } - - 
free(env->me_pbuf); - free(env->me_dbiseqs); - free(env->me_dbflags); - free(env->me_path); - free(env->me_dirty_list); - if (env->me_txn0) - mdb_midl_free(env->me_txn0->mt_lifo_reclaimed); - free(env->me_txn0); - mdb_midl_free(env->me_free_pgs); - - if (env->me_flags & MDB_ENV_TXKEY) { - mdb_ensure(env, pthread_key_delete(env->me_txkey) == 0); - env->me_flags &= ~MDB_ENV_TXKEY; - } - - if (env->me_map) { - munmap(env->me_map, env->me_mapsize); -#ifdef USE_VALGRIND - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = -1; -#endif - } - if (env->me_fd != INVALID_HANDLE_VALUE) - (void) close(env->me_fd); - - /* Clearing readers is done in this function because - * me_txkey with its destructor must be disabled first. - * - * We skip the the reader mutex, so we touch only - * data owned by this process (me_close_readers and - * our readers), and clear each reader atomically. - */ - if (env->me_pid == getpid()) - mdbx_rthc_cleanup(env); - - munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); - env->me_txns = NULL; - env->me_pid = 0; - - if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void) close(env->me_lfd); - } -} - -MDBX_ONLY_FEATURE int __cold -mdbx_env_close_ex(MDB_env *env, int dont_sync) -{ - MDB_page *dp; - int rc = MDB_SUCCESS; - - if (unlikely(!env)) - return EINVAL; - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (! 
dont_sync && env->me_txns) - rc = mdb_env_sync(env, 1); - - VALGRIND_DESTROY_MEMPOOL(env); - while ((dp = env->me_dpages) != NULL) { - ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next)); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dpages = dp->mp_next; - free(dp); - } - - mdb_env_close0(env); - env->me_signature = 0; - free(env); - - return rc; -} - -void __cold -mdb_env_close(MDB_env *env) -{ - mdbx_env_close_ex(env, 0); -} - -/* LY: fast enough on most arches - * - * / - * | -1, a < b - * cmp2int(a,b) = < 0, a == b - * | 1, a > b - * \ - */ -#if 1 -# define mdbx_cmp2int(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -# define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/** Compare two items pointing at aligned unsigned int's. */ -static int __hot -mdb_cmp_int_ai(const MDB_val *a, const MDB_val *b) -{ - mdb_assert(NULL, a->mv_size == b->mv_size); - mdb_assert(NULL, 0 == (uintptr_t) a->mv_data % sizeof(int) - && 0 == (uintptr_t) b->mv_data % sizeof(int)); - - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int( *(size_t *)a->mv_data, *(size_t *)b->mv_data ); - - mdb_assert(NULL, a->mv_size == sizeof(int) ); - return mdbx_cmp2int( *(unsigned *)a->mv_data, *(unsigned *)b->mv_data ); -} - -/** Compare two items pointing at 2-byte aligned unsigned int's. 
*/ -static int __hot -mdb_cmp_int_a2(const MDB_val *a, const MDB_val *b) -{ - mdb_assert(NULL, a->mv_size == b->mv_size); - mdb_assert(NULL, 0 == (uintptr_t) a->mv_data % sizeof(uint16_t) - && 0 == (uintptr_t) b->mv_data % sizeof(uint16_t)); -#ifdef MISALIGNED_OK - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int( *(size_t *)a->mv_data, *(size_t *)b->mv_data ); - - mdb_assert(NULL, a->mv_size == sizeof(int) ); - return mdbx_cmp2int( *(unsigned *)a->mv_data, *(unsigned *)b->mv_data ); -#else - mdb_assert(NULL, 0 == a->mv_size % sizeof(uint16_t)); - { - int diff; - const uint16_t *pa, *pb, *end; - -#if BYTE_ORDER == LITTLE_ENDIAN - end = (const uint16_t *) a->mv_data; - pa = (const uint16_t *) ((char *) a->mv_data + a->mv_size); - pb = (const uint16_t *) ((char *) b->mv_data + a->mv_size); - do { - diff = *--pa - *--pb; -#else /* BYTE_ORDER */ - end = (const uint16_t *) ((char *) a->mv_data + a->mv_size); - pa = (const uint16_t *) a->mv_data; - pb = (const uint16_t *) b->mv_data; - do { - diff = *pa++ - *pb++; -#endif /* BYTE_ORDER */ - if (likely(diff != 0)) break; - } while(pa != end); - return diff; - } -#endif /* MISALIGNED_OK */ -} - -/** Compare two items pointing at unsigneds of unknown alignment. - * - * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp. 
- */ -static int __hot -mdb_cmp_int_ua(const MDB_val *a, const MDB_val *b) -{ - mdb_assert(NULL, a->mv_size == b->mv_size); -#if MISALIGNED_OK - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int( *(size_t *)a->mv_data, *(size_t *)b->mv_data ); - - mdb_assert(NULL, a->mv_size == sizeof(int) ); - return mdbx_cmp2int( *(unsigned *)a->mv_data, *(unsigned *)b->mv_data ); -#else - mdb_assert(NULL, a->mv_size == sizeof(int) || a->mv_size == sizeof(size_t)); -#if BYTE_ORDER == LITTLE_ENDIAN - { - int diff; - const uint8_t *pa, *pb; - - pa = (const uint8_t *)a->mv_data + a->mv_size; - pb = (const uint8_t *)b->mv_data + a->mv_size; - - do { - diff = *--pa - *--pb; - if (likely(diff != 0)) break; - } while(pa != a->mv_data); - return diff; - } -#else /* BYTE_ORDER */ - return memcmp(a->mv_data, b->mv_data, a->mv_size); -#endif /* BYTE_ORDER */ -#endif /* MISALIGNED_OK */ -} - -/** Compare two items lexically */ -static int __hot -mdb_cmp_memn(const MDB_val *a, const MDB_val *b) -{ - /* LY: assumes that length of keys are NOT equal for most cases, - * if no then branch-prediction should mitigate the problem */ -#if 0 - /* LY: without branch instructions on x86, - * but isn't best for equal length of keys */ - int diff_len = mdbx_cmp2int(a->mv_size, b->mv_size); -#else - /* LY: best when length of keys are equal, - * but got a branch-penalty otherwise */ - if (unlikely(a->mv_size == b->mv_size)) - return memcmp(a->mv_data, b->mv_data, a->mv_size); - int diff_len = (a->mv_size < b->mv_size) ? -1 : 1; -#endif - size_t shortest = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; - int diff_data = memcmp(a->mv_data, b->mv_data, shortest); - return likely(diff_data) ? 
diff_data : diff_len; -} - -/** Compare two items in reverse byte order */ -static int __hot -mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) -{ - const uint8_t *pa, *pb, *end; - - pa = (const uint8_t *)a->mv_data + a->mv_size; - pb = (const uint8_t *)b->mv_data + b->mv_size; - size_t minlen = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; - end = pa - minlen; - - while (pa != end) { - int diff = *--pa - *--pb; - if (likely(diff)) - return diff; - } - return mdbx_cmp2int(a->mv_size, b->mv_size); -} - -/** Search for key within a page, using binary search. - * Returns the smallest entry larger or equal to the key. - * If exactp is non-null, stores whether the found entry was an exact match - * in *exactp (1 or 0). - * Updates the cursor index with the index of the found entry. - * If no entry larger or equal to the key is found, returns NULL. - */ -static MDB_node * __hot -mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) -{ - unsigned i = 0, nkeys; - int low, high; - int rc = 0; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NULL; - MDB_val nodekey; - MDB_cmp_func *cmp; - DKBUF; - - nkeys = NUMKEYS(mp); - - mdb_debug("searching %u keys in %s %spage %zu", - nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp)); - - low = IS_LEAF(mp) ? 0 : 1; - high = nkeys - 1; - cmp = mc->mc_dbx->md_cmp; - - /* Branch pages have no data, so if using integer keys, - * alignment is guaranteed. Use faster mdb_cmp_int_ai. 
- */ - if (cmp == mdb_cmp_int_a2 && IS_BRANCH(mp)) - cmp = mdb_cmp_int_ai; - - if (IS_LEAF2(mp)) { - nodekey.mv_size = mc->mc_db->md_xsize; - node = NODEPTR(mp, 0); /* fake */ - while (low <= high) { - i = (low + high) >> 1; - nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); - rc = cmp(key, &nodekey); - mdb_debug("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc); - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } else { - while (low <= high) { - i = (low + high) >> 1; - - node = NODEPTR(mp, i); - nodekey.mv_size = NODEKSZ(node); - nodekey.mv_data = NODEKEY(node); - - rc = cmp(key, &nodekey); - if (IS_LEAF(mp)) - mdb_debug("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc); - else - mdb_debug("found branch index %u [%s -> %zu], rc = %i", - i, DKEY(&nodekey), NODEPGNO(node), rc); - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } - - if (rc > 0) { /* Found entry is less than the key. */ - i++; /* Skip to get the smallest entry larger than key. */ - if (!IS_LEAF2(mp)) - node = NODEPTR(mp, i); - } - if (exactp) - *exactp = (rc == 0 && nkeys > 0); - /* store the key index */ - mc->mc_ki[mc->mc_top] = i; - if (i >= nkeys) - /* There is no entry larger or equal to the key. */ - return NULL; - - /* nodeptr is fake for LEAF2 */ - return node; -} - -#if 0 -static void -mdb_cursor_adjust(MDB_cursor *mc, func) -{ - MDB_cursor *m2; - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { - func(mc, m2); - } - } -} -#endif - -/** Pop a page off the top of the cursor's stack. */ -static void -mdb_cursor_pop(MDB_cursor *mc) -{ - if (mc->mc_snum) { - mdb_debug("popped page %zu off db %d cursor %p", - mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc); - - mc->mc_snum--; - if (mc->mc_snum) { - mc->mc_top--; - } else { - mc->mc_flags &= ~C_INITIALIZED; - } - } -} - -/** Push a page onto the top of the cursor's stack. 
- * Set #MDB_TXN_ERROR on failure. - */ -static int -mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) -{ - mdb_debug("pushing page %zu on db %d cursor %p", mp->mp_pgno, - DDBI(mc), (void *) mc); - - if (unlikely(mc->mc_snum >= CURSOR_STACK)) { - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CURSOR_FULL; - } - - mc->mc_top = mc->mc_snum++; - mc->mc_pg[mc->mc_top] = mp; - mc->mc_ki[mc->mc_top] = 0; - - return MDB_SUCCESS; -} - -/** Find the address of the page corresponding to a given page number. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc the cursor accessing the page. - * @param[in] pgno the page number for the page to retrieve. - * @param[out] ret address of a pointer where the page's address will be stored. - * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) -{ - MDB_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; - MDB_page *p = NULL; - int level; - - if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { - MDB_txn *tx2 = txn; - level = 1; - do { - MDB_ID2L dl = tx2->mt_u.dirty_list; - unsigned x; - /* Spilled pages were dirtied in this txn and flushed - * because the dirty list got full. Bring this page - * back in from the map (but don't unspill it here, - * leave that unless page_touch happens again). 
*/ - if (tx2->mt_spill_pgs) { - MDB_ID pn = pgno << 1; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) - goto mapped; - } - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - p = dl[x].mptr; - goto done; - } - } - level++; - } while ((tx2 = tx2->mt_parent) != NULL); - } - - if (unlikely(pgno >= txn->mt_next_pgno)) { - mdb_debug("page %zu not found", pgno); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_NOTFOUND; - } - level = 0; - -mapped: - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - -done: - *ret = p; - if (lvl) - *lvl = level; - return MDB_SUCCESS; -} - -/** Finish #mdb_page_search() / #mdb_page_search_lowest(). - * The cursor is at the root page, set up the rest of it. - */ -static int -mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int rc; - DKBUF; - - while (IS_BRANCH(mp)) { - MDB_node *node; - indx_t i; - - mdb_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp)); - /* Don't assert on branch pages in the FreeDB. We can get here - * while in the process of rebalancing a FreeDB branch page; we must - * let that proceed. 
ITS#8336 - */ - mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - mdb_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0))); - - if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { - i = 0; - if (flags & MDB_PS_LAST) { - i = NUMKEYS(mp) - 1; - /* if already init'd, see if we're already in right place */ - if (mc->mc_flags & C_INITIALIZED) { - if (mc->mc_ki[mc->mc_top] == i) { - mc->mc_top = mc->mc_snum++; - mp = mc->mc_pg[mc->mc_top]; - goto ready; - } - } - } - } else { - int exact; - node = mdb_node_search(mc, key, &exact); - if (node == NULL) - i = NUMKEYS(mp) - 1; - else { - i = mc->mc_ki[mc->mc_top]; - if (!exact) { - mdb_cassert(mc, i > 0); - i--; - } - } - mdb_debug("following index %u for key [%s]", i, DKEY(key)); - } - - mdb_cassert(mc, i < NUMKEYS(mp)); - node = NODEPTR(mp, i); - - if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) - return rc; - - mc->mc_ki[mc->mc_top] = i; - if (unlikely(rc = mdb_cursor_push(mc, mp))) - return rc; - -ready: - if (flags & MDB_PS_MODIFY) { - if (unlikely((rc = mdb_page_touch(mc)) != 0)) - return rc; - mp = mc->mc_pg[mc->mc_top]; - } - } - - if (unlikely(!IS_LEAF(mp))) { - mdb_debug("internal error, index points to a %02X page!?", - mp->mp_flags); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - - mdb_debug("found leaf page %zu for key [%s]", mp->mp_pgno, - key ? DKEY(key) : "null"); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - return MDB_SUCCESS; -} - -/** Search for the lowest key under the current branch page. - * This just bypasses a NUMKEYS check in the current page - * before calling mdb_page_search_root(), because the callers - * are all in situations where the current page is known to - * be underfilled. 
- */ -static int -mdb_page_search_lowest(MDB_cursor *mc) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NODEPTR(mp, 0); - int rc; - - if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) - return rc; - - mc->mc_ki[mc->mc_top] = 0; - if (unlikely(rc = mdb_cursor_push(mc, mp))) - return rc; - return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); -} - -/** Search for the page a given key should be in. - * Push it and its parent pages on the cursor stack. - * @param[in,out] mc the cursor for this operation. - * @param[in] key the key to search for, or NULL for first/last page. - * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB - * are touched (updated with new page numbers). - * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. - * This is used by #mdb_cursor_first() and #mdb_cursor_last(). - * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) -{ - int rc; - pgno_t root; - - /* Make sure the txn is still viable, then find the root from - * the txn's db table and set it as the root of the cursor's stack. 
- */ - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) { - mdb_debug("transaction has failed, must abort"); - return MDB_BAD_TXN; - } else { - /* Make sure we're using an up-to-date root */ - if (unlikely(*mc->mc_dbflag & DB_STALE)) { - MDB_cursor mc2; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) - return MDB_BAD_DBI; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); - if (rc) - return rc; - { - MDB_val data; - int exact = 0; - uint16_t flags; - MDB_node *leaf = mdb_node_search(&mc2, - &mc->mc_dbx->md_name, &exact); - if (!exact) - return MDB_NOTFOUND; - if (unlikely((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)) - return MDB_INCOMPATIBLE; /* not a named DB */ - rc = mdb_node_read(&mc2, leaf, &data); - if (rc) - return rc; - memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), - sizeof(uint16_t)); - /* The txn may not know this DBI, or another process may - * have dropped and recreated the DB with other flags. - */ - if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)) - return MDB_INCOMPATIBLE; - memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); - } - *mc->mc_dbflag &= ~DB_STALE; - } - root = mc->mc_db->md_root; - - if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ - mdb_debug("tree is empty"); - return MDB_NOTFOUND; - } - } - - mdb_cassert(mc, root > 1); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if (unlikely((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) - return rc; - - mc->mc_snum = 1; - mc->mc_top = 0; - - mdb_debug("db %d root page %zu has flags 0x%X", - DDBI(mc), root, mc->mc_pg[0]->mp_flags); - - if (flags & MDB_PS_MODIFY) { - if (unlikely(rc = mdb_page_touch(mc))) - return rc; - } - - if (flags & MDB_PS_ROOTONLY) - return MDB_SUCCESS; - - return mdb_page_search_root(mc, key, flags); -} - -static int -mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) -{ - MDB_txn *txn = mc->mc_txn; - pgno_t pg = mp->mp_pgno; - unsigned x = 0, ovpages = mp->mp_pages; - MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pgs; - MDB_ID pn = pg << 1; - int rc; - - mdb_debug("free ov page %zu (%u)", pg, ovpages); - /* If the page is dirty or on the spill list we just acquired it, - * so we should give it back to our current free list, if any. - * Otherwise put it onto the list of pages we freed in this txn. - * - * Won't create me_pghead: me_pglast must be inited along with it. - * Unsupported in nested txns: They would need to hide the page - * range in ancestor txns' dirty and spilled lists. - */ - if (env->me_pghead && - !txn->mt_parent && - ((mp->mp_flags & P_DIRTY) || - (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) - { - unsigned i, j; - pgno_t *mop; - MDB_ID2 *dl, ix, iy; - rc = mdb_midl_need(&env->me_pghead, ovpages); - if (unlikely(rc)) - return rc; - if (!(mp->mp_flags & P_DIRTY)) { - /* This page is no longer spilled */ - if (x == sl[0]) - sl[0]--; - else - sl[x] |= 1; - goto release; - } - /* Remove from dirty list */ - dl = txn->mt_u.dirty_list; - x = dl[0].mid--; - for (ix = dl[x]; ix.mptr != mp; ix = iy) { - if (likely(x > 1)) { - x--; - iy = dl[x]; - dl[x] = ix; - } else { - mdb_cassert(mc, x > 1); - j = ++(dl[0].mid); - dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. 
*/ - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - } - txn->mt_dirty_room++; - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dpage_free(env, mp); -release: - /* Insert in me_pghead */ - mop = env->me_pghead; - j = mop[0] + ovpages; - for (i = mop[0]; i && mop[i] < pg; i--) - mop[j--] = mop[i]; - while (j>i) - mop[j--] = pg++; - mop[0] += ovpages; - } else { - rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); - if (unlikely(rc)) - return rc; - } - mc->mc_db->md_overflow_pages -= ovpages; - return 0; -} - -/** Return the data associated with a given node. - * @param[in] mc The cursor for this operation. - * @param[in] leaf The node being read. - * @param[out] data Updated to point to the node's data. - * @return 0 on success, non-zero on failure. - */ -static MDBX_INLINE int -mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) -{ - MDB_page *omp; /* overflow page */ - pgno_t pgno; - int rc; - - if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { - data->mv_size = NODEDSZ(leaf); - data->mv_data = NODEDATA(leaf); - return MDB_SUCCESS; - } - - /* Read overflow data. 
- */ - data->mv_size = NODEDSZ(leaf); - memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if (unlikely((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0)) { - mdb_debug("read overflow page %zu failed", pgno); - return rc; - } - data->mv_data = PAGEDATA(omp); - - return MDB_SUCCESS; -} - -int -mdb_get(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data) -{ - MDB_cursor mc; - MDB_xcursor mx; - int exact = 0; - DKBUF; - - mdb_debug("===> get db %u key [%s]", dbi, DKEY(key)); - - if (unlikely(!key || !data || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); -} - -/** Find a sibling for a page. - * Replaces the page at the top of the cursor's stack with the - * specified sibling, if one exists. - * @param[in] mc The cursor for this operation. - * @param[in] move_right Non-zero if the right sibling is requested, - * otherwise the left sibling. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_cursor_sibling(MDB_cursor *mc, int move_right) -{ - int rc; - MDB_node *indx; - MDB_page *mp; - - if (unlikely(mc->mc_snum < 2)) { - return MDB_NOTFOUND; /* root has no siblings */ - } - - mdb_cursor_pop(mc); - mdb_debug("parent page is page %zu, index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); - - if (move_right - ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { - mdb_debug("no more keys left, moving to %s sibling", - move_right ? 
"right" : "left"); - if (unlikely((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS)) { - /* undo cursor_pop before returning */ - mc->mc_top++; - mc->mc_snum++; - return rc; - } - } else { - if (move_right) - mc->mc_ki[mc->mc_top]++; - else - mc->mc_ki[mc->mc_top]--; - mdb_debug("just moving to %s index key %u", - move_right ? "right" : "left", mc->mc_ki[mc->mc_top]); - } - mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - - indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0)) { - /* mc will be inconsistent if caller does mc_snum++ as above */ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - return rc; - } - - mdb_cursor_push(mc, mp); - if (!move_right) - mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; - - return MDB_SUCCESS; -} - -/** Move the cursor to the next data item. */ -static int -mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) -{ - MDB_page *mp; - MDB_node *leaf; - int rc; - - if ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP) - return MDB_NOTFOUND; - - if (!(mc->mc_flags & C_INITIALIZED)) - return mdb_cursor_first(mc, key, data); - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) - return MDB_NOTFOUND; - mc->mc_flags ^= C_EOF; - } - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_NEXT || op == MDB_NEXT_DUP) { - rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc != MDB_NOTFOUND) { - if (likely(rc == MDB_SUCCESS)) - MDB_GET_KEY(leaf, key); - return rc; - } - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_NEXT_DUP) - return MDB_NOTFOUND; - } - } - - mdb_debug("cursor_next: top page is %zu in cursor %p", - mdb_dbg_pgno(mp), (void *) mc); - if (mc->mc_flags & C_DEL) { - mc->mc_flags ^= C_DEL; - goto 
skip; - } - - if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { - mdb_debug("=====> move to next sibling page"); - if (unlikely((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { - mc->mc_flags |= C_EOF; - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - mdb_debug("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]++; - -skip: - mdb_debug("==> cursor points to page %zu with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Move the cursor to the previous data item. 
*/ -static int -mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) -{ - MDB_page *mp; - MDB_node *leaf; - int rc; - - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (unlikely(rc)) - return rc; - mc->mc_ki[mc->mc_top]++; - } - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_PREV || op == MDB_PREV_DUP) { - rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc != MDB_NOTFOUND) { - if (likely(rc == MDB_SUCCESS)) { - MDB_GET_KEY(leaf, key); - mc->mc_flags &= ~C_EOF; - } - return rc; - } - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_PREV_DUP) - return MDB_NOTFOUND; - } - } - - mdb_debug("cursor_prev: top page is %zu in cursor %p", - mdb_dbg_pgno(mp), (void *) mc); - - mc->mc_flags &= ~(C_EOF|C_DEL); - - if (mc->mc_ki[mc->mc_top] == 0) { - mdb_debug("=====> move to prev sibling page"); - if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - mdb_debug("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]--; - - mdb_debug("==> cursor points to page %zu with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 
- if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Set the cursor on a specific data item. */ -static int -mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, - MDB_cursor_op op, int *exactp) -{ - int rc; - MDB_page *mp; - MDB_node *leaf = NULL; - DKBUF; - - if ( (mc->mc_db->md_flags & MDB_INTEGERKEY) - && unlikely( key->mv_size != sizeof(unsigned) - && key->mv_size != sizeof(size_t) )) { - mdb_cassert(mc, ! "key-size is invalid for MDB_INTEGERKEY"); - return MDB_BAD_VALSIZE; - } - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - /* See if we're already on the right page */ - if (mc->mc_flags & C_INITIALIZED) { - MDB_val nodekey; - - mp = mc->mc_pg[mc->mc_top]; - if (!NUMKEYS(mp)) { - mc->mc_ki[mc->mc_top] = 0; - return MDB_NOTFOUND; - } - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_size = mc->mc_db->md_xsize; - nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, 0); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* Probably happens rarely, but first node on the page - * was the one we wanted. 
- */ - mc->mc_ki[mc->mc_top] = 0; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc > 0) { - unsigned i; - unsigned nkeys = NUMKEYS(mp); - if (nkeys > 1) { - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - nkeys-1, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, nkeys-1); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* last node was the one we wanted */ - mc->mc_ki[mc->mc_top] = nkeys-1; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc < 0) { - if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { - /* This is definitely the right page, skip search_page */ - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - mc->mc_ki[mc->mc_top], nodekey.mv_size); - } else { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* current node was the one we wanted */ - if (exactp) - *exactp = 1; - goto set1; - } - } - rc = 0; - mc->mc_flags &= ~C_EOF; - goto set2; - } - } - /* If any parents have right-sibs, search. - * Otherwise, there's nothing further. */ - for (i=0; imc_top; i++) - if (mc->mc_ki[i] < - NUMKEYS(mc->mc_pg[i])-1) - break; - if (i == mc->mc_top) { - /* There are no other pages */ - mc->mc_ki[mc->mc_top] = nkeys; - return MDB_NOTFOUND; - } - } - if (!mc->mc_top) { - /* There are no other pages */ - mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE && !exactp) { - rc = 0; - goto set1; - } else - return MDB_NOTFOUND; - } - } else { - mc->mc_pg[0] = 0; - } - - rc = mdb_page_search(mc, key, 0); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - -set2: - leaf = mdb_node_search(mc, key, exactp); - if (exactp != NULL && !*exactp) { - /* MDB_SET specified and not an exact match. 
*/ - return MDB_NOTFOUND; - } - - if (leaf == NULL) { - mdb_debug("===> inexact leaf not found, goto sibling"); - if (unlikely((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { - mc->mc_flags |= C_EOF; - return rc; /* no entries matched */ - } - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, 0); - } - -set1: - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - if (IS_LEAF2(mp)) { - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - } - return MDB_SUCCESS; - } - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2, *ex2p; - if (op == MDB_GET_BOTH) { - ex2p = &ex2; - ex2 = 0; - } else { - ex2p = NULL; - } - rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { - MDB_val olddata; - if (unlikely((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) - return rc; - rc = mc->mc_dbx->md_dcmp(data, &olddata); - if (rc) { - if (op == MDB_GET_BOTH || rc > 0) - return MDB_NOTFOUND; - rc = 0; - } - *data = olddata; - } else { - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - } - } - - /* The key already matches in all other cases */ - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) - MDB_GET_KEY(leaf, key); - mdb_debug("==> cursor placed on key [%s]", DKEY(key)); - - return rc; -} - -/** Move the cursor to the first item in the database. 
*/ -static int -mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) -{ - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - mc->mc_ki[mc->mc_top] = 0; - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); - return MDB_SUCCESS; - } - - if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - } - } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Move the cursor to the last item in the database. 
*/ -static int -mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) -{ - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (likely(!(mc->mc_flags & C_EOF))) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - } - - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; - mc->mc_flags |= C_INITIALIZED|C_EOF; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -int -mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, - MDB_cursor_op op) -{ - int rc; - int exact = 0; - int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); - - if (unlikely(mc == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - switch (op) { - case MDB_GET_CURRENT: - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - } else { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int nkeys = NUMKEYS(mp); - if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { - mc->mc_ki[mc->mc_top] = nkeys; - rc = MDB_NOTFOUND; - break; - } - rc = MDB_SUCCESS; - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], 
key->mv_size); - } else { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY(leaf, key); - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - break; - } - rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); - } else { - rc = mdb_node_read(mc, leaf, data); - } - } - } - } - break; - case MDB_GET_BOTH: - case MDB_GET_BOTH_RANGE: - if (unlikely(data == NULL)) { - rc = EINVAL; - break; - } - if (unlikely(mc->mc_xcursor == NULL)) { - rc = MDB_INCOMPATIBLE; - break; - } - /* FALLTHRU */ - case MDB_SET: - case MDB_SET_KEY: - case MDB_SET_RANGE: - if (unlikely(key == NULL)) { - rc = EINVAL; - } else { - rc = mdb_cursor_set(mc, key, data, op, - op == MDB_SET_RANGE ? NULL : &exact); - } - break; - case MDB_GET_MULTIPLE: - if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { - rc = MDB_INCOMPATIBLE; - break; - } - rc = MDB_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) - break; - goto fetchm; - case MDB_NEXT_MULTIPLE: - if (unlikely(data == NULL)) { - rc = EINVAL; - break; - } - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { - rc = MDB_INCOMPATIBLE; - break; - } - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); - if (rc == MDB_SUCCESS) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - MDB_cursor *mx; -fetchm: - mx = &mc->mc_xcursor->mx_cursor; - data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * - mx->mc_db->md_xsize; - data->mv_data = PAGEDATA(mx->mc_pg[mx->mc_top]); - mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; - } else { - rc = MDB_NOTFOUND; - } - } - break; - case MDB_PREV_MULTIPLE: - if (data == NULL) { - rc = EINVAL; - break; - } - 
if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_last(mc, key, data); - else - rc = MDB_SUCCESS; - if (rc == MDB_SUCCESS) { - MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; - if (mx->mc_flags & C_INITIALIZED) { - rc = mdb_cursor_sibling(mx, 0); - if (rc == MDB_SUCCESS) - goto fetchm; - } else { - rc = MDB_NOTFOUND; - } - } - break; - case MDB_NEXT: - case MDB_NEXT_DUP: - case MDB_NEXT_NODUP: - rc = mdb_cursor_next(mc, key, data, op); - break; - case MDB_PREV: - case MDB_PREV_DUP: - case MDB_PREV_NODUP: - rc = mdb_cursor_prev(mc, key, data, op); - break; - case MDB_FIRST: - rc = mdb_cursor_first(mc, key, data); - break; - case MDB_FIRST_DUP: - mfunc = mdb_cursor_first; - mmove: - if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - if (unlikely(mc->mc_xcursor == NULL)) { - rc = MDB_INCOMPATIBLE; - break; - } - { - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - MDB_GET_KEY(leaf, key); - rc = mdb_node_read(mc, leaf, data); - break; - } - } - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); - break; - case MDB_LAST: - rc = mdb_cursor_last(mc, key, data); - break; - case MDB_LAST_DUP: - mfunc = mdb_cursor_last; - goto mmove; - default: - mdb_debug("unhandled/unimplemented cursor operation %u", op); - rc = EINVAL; - break; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - return rc; -} - -/** Touch all the pages in the cursor stack. Set mc_top. - * Makes sure all the pages are writable, before attempting a write operation. - * @param[in] mc The cursor to operate on. 
- */ -static int -mdb_cursor_touch(MDB_cursor *mc) -{ - int rc = MDB_SUCCESS; - - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { - /* Touch DB record of named DB */ - MDB_cursor mc2; - MDB_xcursor mcx; - if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) - return MDB_BAD_DBI; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); - if (unlikely(rc)) - return rc; - *mc->mc_dbflag |= DB_DIRTY; - } - mc->mc_top = 0; - if (mc->mc_snum) { - do { - rc = mdb_page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); - mc->mc_top = mc->mc_snum-1; - } - return rc; -} - -/** Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDB_NOSPILL 0x8000 - -int -mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, - unsigned flags) -{ - MDB_env *env; - MDB_node *leaf = NULL; - MDB_page *fp, *mp, *sub_root = NULL; - uint16_t fp_flags; - MDB_val xdata, *rdata, dkey, olddata; - MDB_db dummy; - int do_sub = 0, insert_key, insert_data; - unsigned mcount = 0, dcount = 0, nospill; - size_t nsize; - int rc, rc2; - unsigned nflags; - DKBUF; - - if (unlikely(mc == NULL || key == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - env = mc->mc_txn->mt_env; - - /* Check this first so counter will always be zero on any - * early failures. - */ - if (flags & MDB_MULTIPLE) { - dcount = data[1].mv_size; - data[1].mv_size = 0; - if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))) - return MDB_INCOMPATIBLE; - } - - if (flags & MDB_RESERVE) { - if (unlikely(mc->mc_db->md_flags & (MDB_DUPSORT | MDB_REVERSEDUP))) - return MDB_INCOMPATIBLE; - } - - nospill = flags & MDB_NOSPILL; - flags &= ~MDB_NOSPILL; - - if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; - - if (unlikely(key->mv_size > ENV_MAXKEY(env))) - return MDB_BAD_VALSIZE; - -#if SIZE_MAX > MAXDATASIZE - if (unlikely(data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))) - return MDB_BAD_VALSIZE; -#else - if ((mc->mc_db->md_flags & MDB_DUPSORT) && unlikely(data->mv_size > ENV_MAXKEY(env))) - return MDB_BAD_VALSIZE; -#endif - - if ((mc->mc_db->md_flags & MDB_INTEGERKEY) - && unlikely(key->mv_size != sizeof(unsigned) - && key->mv_size != sizeof(size_t) )) { - mdb_cassert(mc, ! "key-size is invalid for MDB_INTEGERKEY"); - return MDB_BAD_VALSIZE; - } - - if ((mc->mc_db->md_flags & MDB_INTEGERDUP) - && unlikely(data->mv_size != sizeof(unsigned) - && data->mv_size != sizeof(size_t) )) { - mdb_cassert(mc, ! "data-size is invalid MDB_INTEGERDUP"); - return MDB_BAD_VALSIZE; - } - - mdb_debug("==> put db %d key [%s], size %zu, data size %zu", - DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size); - - int dupdata_flag = 0; - if (flags & MDB_CURRENT) { - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; -#if MDBX_MODE_ENABLED - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_cassert(mc, mc->mc_xcursor != NULL - && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - if (mc->mc_xcursor->mx_db.md_entries > 1) { - rc = mdbx_cursor_del(mc, 0); - if (rc != MDB_SUCCESS) - return rc; - flags -= MDB_CURRENT; - } - } - } -#endif /* MDBX_MODE_ENABLED */ - rc = MDB_SUCCESS; - } else if (mc->mc_db->md_root == P_INVALID) { - /* new database, cursor has nothing to point to */ - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - rc = MDB_NO_ROOT; - } else { - int exact = 0; - MDB_val d2; - if (flags & MDB_APPEND) { - MDB_val k2; - rc = mdb_cursor_last(mc, &k2, &d2); - if (rc == 0) { - rc = mc->mc_dbx->md_cmp(key, &k2); - if (rc > 0) { - rc = MDB_NOTFOUND; - 
mc->mc_ki[mc->mc_top]++; - } else { - /* new key is <= last key */ - rc = MDB_KEYEXIST; - } - } - } else { - rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); - } - if ((flags & MDB_NOOVERWRITE) && rc == 0) { - mdb_debug("duplicate key [%s]", DKEY(key)); - *data = d2; - return MDB_KEYEXIST; - } - if (rc && unlikely(rc != MDB_NOTFOUND)) - return rc; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - /* Cursor is positioned, check for room in the dirty list */ - if (!nospill) { - if (flags & MDB_MULTIPLE) { - rdata = &xdata; - xdata.mv_size = data->mv_size * dcount; - } else { - rdata = data; - } - if (unlikely(rc2 = mdb_page_spill(mc, key, rdata))) - return rc2; - } - - if (rc == MDB_NO_ROOT) { - MDB_page *np; - /* new database, write a root leaf page */ - mdb_debug("allocating new root leaf page"); - if (unlikely(rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { - return rc2; - } - mdb_cursor_push(mc, np); - mc->mc_db->md_root = np->mp_pgno; - mc->mc_db->md_depth++; - *mc->mc_dbflag |= DB_DIRTY; - if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) == MDB_DUPFIXED) - np->mp_flags |= P_LEAF2; - mc->mc_flags |= C_INITIALIZED; - } else { - /* make sure all cursor pages are writable */ - rc2 = mdb_cursor_touch(mc); - if (unlikely(rc2)) - return rc2; - } - - insert_key = insert_data = rc; - if (insert_key) { - /* The key does not exist */ - mdb_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - LEAFSIZE(key, data) > env->me_nodemax) - { - /* Too big for a node, insert in sub-DB. Set up an empty - * "old sub-page" for prep_subDB to expand to a full page. 
- */ - fp_flags = P_LEAF|P_DIRTY; - fp = env->me_pbuf; - fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */ - fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); - olddata.mv_size = PAGEHDRSZ; - goto prep_subDB; - } - } else { - /* there's only a key anyway, so this is a no-op */ - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - char *ptr; - unsigned ksize = mc->mc_db->md_xsize; - if (key->mv_size != ksize) - return MDB_BAD_VALSIZE; - ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->mv_data, ksize); -fix_parent: - /* if overwriting slot 0 of leaf, need to - * update branch key if there is a parent page - */ - if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short dtop = 1; - mc->mc_top--; - /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - mc->mc_top--; - dtop++; - } - if (mc->mc_ki[mc->mc_top]) - rc2 = mdb_update_key(mc, key); - else - rc2 = MDB_SUCCESS; - mc->mc_top += dtop; - if (rc2) - return rc2; - } - return MDB_SUCCESS; - } - -more: - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - olddata.mv_size = NODEDSZ(leaf); - olddata.mv_data = NODEDATA(leaf); - - /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - /* Prepare (sub-)page/sub-DB to accept the new item, - * if needed. fp: old sub-page or a header faking - * it. mp: new (sub-)page. offset: growth in page - * size. xdata: node data with new page or DB. - */ - unsigned i, offset = 0; - mp = fp = xdata.mv_data = env->me_pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; - - /* Was a single item before, must convert now */ - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - /* Just overwrite the current item */ - if (flags & MDB_CURRENT) { - if ((flags & MDB_NODUPDATA) && !mc->mc_dbx->md_dcmp(data, &olddata)) - return MDB_KEYEXIST; - goto current; - } - - /* does data match? 
*/ - if (!mc->mc_dbx->md_dcmp(data, &olddata)) { - if (unlikely(flags & (MDB_NODUPDATA|MDB_APPENDDUP))) - return MDB_KEYEXIST; - /* overwrite it */ - goto current; - } - - /* Back up original data item */ - dupdata_flag = 1; - dkey.mv_size = olddata.mv_size; - dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); - - /* Make sub-page header for the dup items, with dummy body */ - fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; - fp->mp_lower = (PAGEHDRSZ-PAGEBASE); - xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp->mp_flags |= P_LEAF2; - fp->mp_leaf2_ksize = data->mv_size; - xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ - } else { - xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.mv_size & 1) + (data->mv_size & 1); - } - fp->mp_upper = xdata.mv_size - PAGEBASE; - olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ - } else if (leaf->mn_flags & F_SUBDATA) { - /* Data is on sub-DB, just store it */ - flags |= F_DUPDATA|F_SUBDATA; - goto put_sub; - } else { - /* Data is on sub-page */ - fp = olddata.mv_data; - switch (flags) { - default: - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - offset = EVEN(NODESIZE + sizeof(indx_t) + - data->mv_size); - break; - } - offset = fp->mp_leaf2_ksize; - if (SIZELEFT(fp) < offset) { - offset *= 4; /* space for 4 more */ - break; - } - /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ - case MDB_CURRENT | MDB_NODUPDATA: - case MDB_CURRENT: - fp->mp_flags |= P_DIRTY; - COPY_PGNO(fp->mp_pgno, mp->mp_pgno); - mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; - flags |= F_DUPDATA; - goto put_sub; - } - xdata.mv_size = olddata.mv_size + offset; - } - - fp_flags = fp->mp_flags; - if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { - /* Too big for a sub-page, convert to sub-DB */ - fp_flags &= ~P_SUBP; -prep_subDB: - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp_flags |= P_LEAF2; - dummy.md_xsize = fp->mp_leaf2_ksize; - dummy.md_flags = 
MDB_DUPFIXED; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - dummy.md_flags |= MDB_INTEGERKEY; - } else { - dummy.md_xsize = 0; - dummy.md_flags = 0; - } - dummy.md_depth = 1; - dummy.md_branch_pages = 0; - dummy.md_leaf_pages = 1; - dummy.md_overflow_pages = 0; - dummy.md_entries = NUMKEYS(fp); - xdata.mv_size = sizeof(MDB_db); - xdata.mv_data = &dummy; - if ((rc = mdb_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) - return rc; - offset = env->me_psize - olddata.mv_size; - flags |= F_DUPDATA|F_SUBDATA; - dummy.md_root = mp->mp_pgno; - sub_root = mp; - } - if (mp != fp) { - mp->mp_flags = fp_flags | P_DIRTY; - mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; - mp->mp_lower = fp->mp_lower; - mp->mp_upper = fp->mp_upper + offset; - if (fp_flags & P_LEAF2) { - memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); - } else { - memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, - olddata.mv_size - fp->mp_upper - PAGEBASE); - for (i=0; imp_ptrs[i] = fp->mp_ptrs[i] + offset; - } - } - - rdata = &xdata; - flags |= F_DUPDATA; - do_sub = 1; - if (!insert_key) - mdb_node_del(mc, 0); - goto new_sub; - } -current: - /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ - if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) - return MDB_INCOMPATIBLE; - /* overflow page overwrites need special handling */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); - - memcpy(&pg, olddata.mv_data, sizeof(pg)); - if (unlikely((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0)) - return rc2; - ovpages = omp->mp_pages; - - /* Is the ov page large enough? */ - if (ovpages >= dpages) { - if (!(omp->mp_flags & P_DIRTY) - && (level || (env->me_flags & MDB_WRITEMAP))) { - rc = mdb_page_unspill(mc->mc_txn, omp, &omp); - if (unlikely(rc)) - return rc; - level = 0; /* dirty in this txn or clean */ - } - /* Is it dirty? 
*/ - if (omp->mp_flags & P_DIRTY) { - /* yes, overwrite it. Note in this case we don't - * bother to try shrinking the page if the new data - * is smaller than the overflow threshold. - */ - if (unlikely(level > 1)) { - /* It is writable only in a parent txn */ - MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); - MDB_ID2 id2; - if (unlikely(!np)) - return ENOMEM; - id2.mid = pg; - id2.mptr = np; - /* Note - this page is already counted in parent's dirty_room */ - rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); - mdb_cassert(mc, rc2 == 0); - /* Currently we make the page look as with put() in the - * parent txn, in case the user peeks at MDB_RESERVEd - * or unused parts. Some users treat ovpages specially. - */ - size_t sz = (size_t) env->me_psize * ovpages, off; - if (MDBX_MODE_ENABLED || !(flags & MDB_RESERVE)) { - /* Skip the part where LMDB will put *data. - * Copy end of page, adjusting alignment so - * compiler may copy words instead of bytes. - */ - off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); - memcpy((size_t *)((char *)np + off), - (size_t *)((char *)omp + off), sz - off); - sz = PAGEHDRSZ; - } - memcpy(np, omp, sz); /* Copy whole or header of page */ - omp = np; - } - SETDSZ(leaf, data->mv_size); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = PAGEDATA(omp); - else - memcpy(PAGEDATA(omp), data->mv_data, data->mv_size); - return MDB_SUCCESS; - } - } - if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) - return rc2; - } else if (data->mv_size == olddata.mv_size) { - /* same size, just replace it. Note that we could - * also reuse this node if the new data is smaller, - * but instead we opt to shrink the node in that case. 
- */ - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = olddata.mv_data; - else if (!(mc->mc_flags & C_SUB)) - memcpy(olddata.mv_data, data->mv_data, data->mv_size); - else { - memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); - goto fix_parent; - } - return MDB_SUCCESS; - } - mdb_node_del(mc, 0); - } - - rdata = data; - -new_sub: - nflags = flags & NODE_ADD_FLAGS; - nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); - if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { - if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) - nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ - if (!insert_key) - nflags |= MDB_SPLIT_REPLACE; - rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); - } else { - /* There is room already in this leaf page. */ - rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (likely(rc == 0)) { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; - if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { - m3->mc_ki[i]++; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); - } - } - } - - if (likely(rc == MDB_SUCCESS)) { - /* Now store the actual data in the child DB. Note that we're - * storing the user data in the keys field, so there are strict - * size limits on dupdata. The actual data fields of the child - * DB are all zero size. */ - if (do_sub) { - int xflags; - size_t ecount; -put_sub: - xdata.mv_size = 0; - xdata.mv_data = ""; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (flags & MDB_CURRENT) { - xflags = (flags & MDB_NODUPDATA) ? 
- MDB_CURRENT|MDB_NOOVERWRITE|MDB_NOSPILL : MDB_CURRENT|MDB_NOSPILL; - } else { - mdb_xcursor_init1(mc, leaf); - xflags = (flags & MDB_NODUPDATA) ? - MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; - } - if (sub_root) - mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; - /* converted, write the original data first */ - if (dupdata_flag) { - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); - if (unlikely(rc)) - goto bad_sub; - /* we've done our job */ - dkey.mv_size = 0; - } - if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2; - MDB_xcursor *mx = mc->mc_xcursor; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - int nkeys = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp) { - if (m2->mc_ki[i] == mc->mc_ki[i]) { - mdb_xcursor_init2(m2, mx, dupdata_flag); - } else if (!insert_key && m2->mc_ki[i] < nkeys) { - XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); - } - } - } - } - ecount = mc->mc_xcursor->mx_db.md_entries; - if (flags & MDB_APPENDDUP) - xflags |= MDB_APPEND; - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); - if (flags & F_SUBDATA) { - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } - insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; - } - /* Increment count unless we just replaced an existing item. */ - if (insert_data) - mc->mc_db->md_entries++; - if (insert_key) { - /* Invalidate txn if we created an empty sub-DB */ - if (unlikely(rc)) - goto bad_sub; - /* If we succeeded and the key didn't exist before, - * make sure the cursor is marked valid. 
*/ - mc->mc_flags |= C_INITIALIZED; - } - if (flags & MDB_MULTIPLE) { - if (!rc) { - mcount++; - /* let caller know how many succeeded, if any */ - data[1].mv_size = mcount; - if (mcount < dcount) { - data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; - insert_key = insert_data = 0; - goto more; - } - } - } - return rc; -bad_sub: - if (unlikely(rc == MDB_KEYEXIST)) /* should not happen, we deleted that item */ - rc = MDB_PROBLEM; - } - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_cursor_del(MDB_cursor *mc, unsigned flags) -{ - MDB_node *leaf; - MDB_page *mp; - int rc; - - if (unlikely(!mc)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; - - if (unlikely(mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))) - return MDB_NOTFOUND; - - if (unlikely(!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))) - return rc; - - rc = mdb_cursor_touch(mc); - if (unlikely(rc)) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - if (IS_LEAF2(mp)) - goto del_key; - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (flags & MDB_NODUPDATA) { - /* mdb_cursor_del0() will subtract the final entry */ - mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; - } else { - if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); - if (unlikely(rc)) - return rc; - /* If sub-DB still has entries, we're done */ - if (mc->mc_xcursor->mx_db.md_entries) { - if (leaf->mn_flags & F_SUBDATA) { - /* update subDB info */ - void *db = NODEDATA(leaf); - memcpy(db, 
&mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } else { - MDB_cursor *m2; - /* shrink fake page */ - mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - /* fix other sub-DB cursors pointed at fake pages on this page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - MDB_node *n2 = leaf; - if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { - n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); - if (n2->mn_flags & F_SUBDATA) continue; - } - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); - } - } - } - mc->mc_db->md_entries--; - return rc; - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; - } - /* otherwise fall thru and delete the sub-DB */ - } - - if (leaf->mn_flags & F_SUBDATA) { - /* add all the child DB's pages to the free list */ - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (unlikely(rc)) - goto fail; - } - } - /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ - else if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) { - rc = MDB_INCOMPATIBLE; - goto fail; - } - - /* add overflow pages to free list */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if (unlikely((rc = mdb_page_get(mc, pg, &omp, NULL)) || - (rc = mdb_ovpage_free(mc, omp)))) - goto fail; - } - -del_key: - return mdb_cursor_del0(mc); - -fail: - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -/** Allocate and initialize new pages for a database. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc a cursor on the database being added to. - * @param[in] flags flags defining what type of page is being allocated. - * @param[in] num the number of pages to allocate. This is usually 1, - * unless allocating overflow pages for a large record. 
- * @param[out] mp Address of a page, or NULL on failure. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) -{ - MDB_page *np; - int rc; - - if (unlikely((rc = mdb_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) - return rc; - mdb_debug("allocated new mpage %zu, page size %u", - np->mp_pgno, mc->mc_txn->mt_env->me_psize); - np->mp_flags = flags | P_DIRTY; - np->mp_lower = (PAGEHDRSZ-PAGEBASE); - np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; - - if (IS_BRANCH(np)) - mc->mc_db->md_branch_pages++; - else if (IS_LEAF(np)) - mc->mc_db->md_leaf_pages++; - else if (IS_OVERFLOW(np)) { - mc->mc_db->md_overflow_pages += num; - np->mp_pages = num; - } - *mp = np; - - return 0; -} - -/** Calculate the size of a leaf node. - * The size depends on the environment's page size; if a data item - * is too large it will be put onto an overflow page and the node - * size will only include the key and not the data. Sizes are always - * rounded up to an even number of bytes, to guarantee 2-byte alignment - * of the #MDB_node headers. - * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @param[in] data The data for the node. - * @return The number of bytes needed to store the node. - */ -static MDBX_INLINE size_t -mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) -{ - size_t sz; - - sz = LEAFSIZE(key, data); - if (sz > env->me_nodemax) { - /* put on overflow page */ - sz -= data->mv_size - sizeof(pgno_t); - } - - return EVEN(sz + sizeof(indx_t)); -} - -/** Calculate the size of a branch node. - * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto overflow - * pages, it's simply the size of the #MDB_node header plus the - * size of the key. Sizes are always rounded up to an even number - * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. 
- * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @return The number of bytes needed to store the node. - */ -static MDBX_INLINE size_t -mdb_branch_size(MDB_env *env, MDB_val *key) -{ - size_t sz; - - sz = INDXSIZE(key); - if (unlikely(sz > env->me_nodemax)) { - /* put on overflow page */ - /* not implemented */ - mdb_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __FUNCTION__, __LINE__); - sz -= key->mv_size - sizeof(pgno_t); - } - - return sz + sizeof(indx_t); -} - -/** Add a node to the page pointed to by the cursor. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc The cursor for this operation. - * @param[in] indx The index on the page where the new node should be added. - * @param[in] key The key for the new node. - * @param[in] data The data for the new node, if any. - * @param[in] pgno The page number, if adding a branch node. - * @param[in] flags Flags for the node. - * @return 0 on success, non-zero on failure. Possible errors are: - *
    - *
  • ENOMEM - failed to allocate overflow pages for the node. - *
  • MDB_PAGE_FULL - there is insufficient room in the page. This error - * should never happen since all callers already calculate the - * page's free space before calling this function. - *
- */ -static int -mdb_node_add(MDB_cursor *mc, indx_t indx, - MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags) -{ - unsigned i; - size_t node_size = NODESIZE; - ssize_t room; - indx_t ofs; - MDB_node *node; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_page *ofp = NULL; /* overflow page */ - void *ndata; - DKBUF; - - mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - - mdb_debug("add to %s %spage %zu index %i, data size %zu key size %zu [%s]", - IS_LEAF(mp) ? "leaf" : "branch", - IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, key ? DKEY(key) : "null"); - - if (IS_LEAF2(mp)) { - mdb_cassert(mc, key); - /* Move higher keys up one slot. */ - int ksize = mc->mc_db->md_xsize, dif; - char *ptr = LEAF2KEY(mp, indx, ksize); - dif = NUMKEYS(mp) - indx; - if (dif > 0) - memmove(ptr+ksize, ptr, dif*ksize); - /* insert new key */ - memcpy(ptr, key->mv_data, ksize); - - /* Just using these for counting */ - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - return MDB_SUCCESS; - } - - room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); - if (key != NULL) - node_size += key->mv_size; - if (IS_LEAF(mp)) { - mdb_cassert(mc, key && data); - if (unlikely(F_ISSET(flags, F_BIGDATA))) { - /* Data already on overflow page. */ - node_size += sizeof(pgno_t); - } else if (unlikely(node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax)) { - int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); - int rc; - /* Put data on overflow page. 
*/ - mdb_debug("data size is %zu, node would be %zu, put data on overflow page", - data->mv_size, node_size+data->mv_size); - node_size = EVEN(node_size + sizeof(pgno_t)); - if ((ssize_t)node_size > room) - goto full; - if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) - return rc; - mdb_debug("allocated overflow page %zu", ofp->mp_pgno); - flags |= F_BIGDATA; - goto update; - } else { - node_size += data->mv_size; - } - } - node_size = EVEN(node_size); - if (unlikely((ssize_t)node_size > room)) - goto full; - -update: - /* Move higher pointers up one slot. */ - for (i = NUMKEYS(mp); i > indx; i--) - mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; - - /* Adjust free space offsets. */ - ofs = mp->mp_upper - node_size; - mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mp->mp_ptrs[indx] = ofs; - mp->mp_upper = ofs; - mp->mp_lower += sizeof(indx_t); - - /* Write the node data. */ - node = NODEPTR(mp, indx); - node->mn_ksize = (key == NULL) ? 0 : key->mv_size; - node->mn_flags = flags; - if (IS_LEAF(mp)) - SETDSZ(node,data->mv_size); - else - SETPGNO(node,pgno); - - if (key) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - if (IS_LEAF(mp)) { - ndata = NODEDATA(node); - if (unlikely(ofp == NULL)) { - if (unlikely(F_ISSET(flags, F_BIGDATA))) - memcpy(ndata, data->mv_data, sizeof(pgno_t)); - else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; - else if (likely(ndata != data->mv_data)) - memcpy(ndata, data->mv_data, data->mv_size); - } else { - memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); - ndata = PAGEDATA(ofp); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; - else if (likely(ndata != data->mv_data)) - memcpy(ndata, data->mv_data, data->mv_size); - } - } - - return MDB_SUCCESS; - -full: - mdb_debug("not enough room in page %zu, got %u ptrs", - mdb_dbg_pgno(mp), NUMKEYS(mp)); - mdb_debug("upper-lower = %u - %u = %zd", mp->mp_upper,mp->mp_lower,room); - mdb_debug("node size = %zu", node_size); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - 
return MDB_PAGE_FULL; -} - -/** Delete the specified node from a page. - * @param[in] mc Cursor pointing to the node to delete. - * @param[in] ksize The size of a node. Only used if the page is - * part of a #MDB_DUPFIXED database. - */ -static void -mdb_node_del(MDB_cursor *mc, int ksize) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - indx_t indx = mc->mc_ki[mc->mc_top]; - unsigned sz; - indx_t i, j, numkeys, ptr; - MDB_node *node; - char *base; - - mdb_debug("delete node %u on %s page %zu", indx, - IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)); - numkeys = NUMKEYS(mp); - mdb_cassert(mc, indx < numkeys); - - if (IS_LEAF2(mp)) { - int x = numkeys - 1 - indx; - base = LEAF2KEY(mp, indx, ksize); - if (x) - memmove(base, base + ksize, x * ksize); - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += ksize - sizeof(indx_t); - return; - } - - node = NODEPTR(mp, indx); - sz = NODESIZE + node->mn_ksize; - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += NODEDSZ(node); - } - sz = EVEN(sz); - - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < numkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) - mp->mp_ptrs[j] += sz; - j++; - } - } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + sz, base, ptr - mp->mp_upper); - - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += sz; -} - -/** Compact the main page after deleting a node on a subpage. - * @param[in] mp The main page to operate on. - * @param[in] indx The index of the subpage on the main page. 
- */ -static void -mdb_node_shrink(MDB_page *mp, indx_t indx) -{ - MDB_node *node; - MDB_page *sp, *xp; - char *base; - indx_t delta, nsize, len, ptr; - int i; - - node = NODEPTR(mp, indx); - sp = (MDB_page *)NODEDATA(node); - delta = SIZELEFT(sp); - nsize = NODEDSZ(node) - delta; - - /* Prepare to shift upward, set len = length(subpage part to shift) */ - if (IS_LEAF2(sp)) { - len = nsize; - if (nsize & 1) - return; /* do not make the node uneven-sized */ - } else { - xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ - for (i = NUMKEYS(sp); --i >= 0; ) - xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; - len = PAGEHDRSZ; - } - sp->mp_upper = sp->mp_lower; - COPY_PGNO(sp->mp_pgno, mp->mp_pgno); - SETDSZ(node, nsize); - - /* Shift upward */ - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + delta, base, (char *)sp + len - base); - - ptr = mp->mp_ptrs[indx]; - for (i = NUMKEYS(mp); --i >= 0; ) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] += delta; - } - mp->mp_upper += delta; -} - -/** Initial setup of a sorted-dups cursor. - * Sorted duplicates are implemented as a sub-database for the given key. - * The duplicate data items are actually keys of the sub-database. - * Operations on the duplicate data items are performed using a sub-cursor - * initialized when the sub-database is first accessed. This function does - * the preliminary setup of the sub-cursor, filling in the fields that - * depend only on the parent DB. - * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 
- */ -static void -mdb_xcursor_init0(MDB_cursor *mc) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - mx->mx_cursor.mc_xcursor = NULL; - mx->mx_cursor.mc_txn = mc->mc_txn; - mx->mx_cursor.mc_db = &mx->mx_db; - mx->mx_cursor.mc_dbx = &mx->mx_dbx; - mx->mx_cursor.mc_dbi = mc->mc_dbi; - mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - mx->mx_dbx.md_name.mv_size = 0; - mx->mx_dbx.md_name.mv_data = NULL; - mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; - mx->mx_dbx.md_dcmp = NULL; - mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; -} - -/** Final setup of a sorted-dups cursor. - * Sets up the fields that depend on the data from the main cursor. - * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. - * @param[in] node The data containing the #MDB_db record for the - * sorted-dup database. - */ -static void -mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - if (node->mn_flags & F_SUBDATA) { - memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); - mx->mx_cursor.mc_pg[0] = 0; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - } else { - MDB_page *fp = NODEDATA(node); - mx->mx_db.md_xsize = 0; - mx->mx_db.md_flags = 0; - mx->mx_db.md_depth = 1; - mx->mx_db.md_branch_pages = 0; - mx->mx_db.md_leaf_pages = 1; - mx->mx_db.md_overflow_pages = 0; - mx->mx_db.md_entries = NUMKEYS(fp); - COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; - mx->mx_cursor.mc_pg[0] = fp; - mx->mx_cursor.mc_ki[0] = 0; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - mx->mx_db.md_flags = MDB_DUPFIXED; - mx->mx_db.md_xsize = fp->mp_leaf2_ksize; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - mx->mx_db.md_flags |= MDB_INTEGERKEY; - } - } - mdb_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); - mx->mx_dbflag = 
DB_VALID|DB_USRVALID|DB_DUPDATA; -/* #if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) - mx->mx_dbx.md_cmp = mdb_cmp_clong; -#endif */ -} - -/** Fixup a sorted-dups cursor due to underlying update. - * Sets up some fields that depend on the data from the main cursor. - * Almost the same as init1, but skips initialization steps if the - * xcursor had already been used. - * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. - * @param[in] src_mx The xcursor of an up-to-date cursor. - * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. - */ -static void -mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - if (new_dupdata) { - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags |= C_INITIALIZED; - mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; - mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; - } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { - return; - } - mx->mx_db = src_mx->mx_db; - mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - mdb_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); -} - -/** Initialize a cursor for a given transaction and database. 
*/ -static void -mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) -{ - mc->mc_signature = MDBX_MC_SIGNATURE; - mc->mc_next = NULL; - mc->mc_backup = NULL; - mc->mc_dbi = dbi; - mc->mc_txn = txn; - mc->mc_db = &txn->mt_dbs[dbi]; - mc->mc_dbx = &txn->mt_dbxs[dbi]; - mc->mc_dbflag = &txn->mt_dbflags[dbi]; - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_pg[0] = 0; - mc->mc_flags = 0; - mc->mc_ki[0] = 0; - mc->mc_xcursor = NULL; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - mdb_tassert(txn, mx != NULL); - mx->mx_cursor.mc_signature = MDBX_MC_SIGNATURE; - mc->mc_xcursor = mx; - mdb_xcursor_init0(mc); - } - if (unlikely(*mc->mc_dbflag & DB_STALE)) { - mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); - } -} - -int -mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) -{ - MDB_cursor *mc; - size_t size = sizeof(MDB_cursor); - - if (unlikely(!ret || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EINVAL; - - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) - size += sizeof(MDB_xcursor); - - if (likely((mc = malloc(size)) != NULL)) { - mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); - if (txn->mt_cursors) { - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; - mc->mc_flags |= C_UNTRACK; - } - } else { - return ENOMEM; - } - - *ret = mc; - - return MDB_SUCCESS; -} - -int -mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) -{ - if (unlikely(!mc || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE - && mc->mc_signature != MDBX_MC_READY4CLOSE)) - return EINVAL; - - if (unlikely(!TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))) - 
return EINVAL; - - if (unlikely(mc->mc_backup)) - return EINVAL; - - if (unlikely((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)) { -#if MDBX_MODE_ENABLED - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - mc->mc_signature = MDBX_MC_READY4CLOSE; -#else - return EINVAL; -#endif - } - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); - return MDB_SUCCESS; -} - -/* Return the count of duplicate data items for the current key */ -int -mdb_cursor_count(MDB_cursor *mc, size_t *countp) -{ - if (unlikely(mc == NULL || countp == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; - -#if MDBX_MODE_ENABLED - if (!mc->mc_snum) { - *countp = 0; - return MDB_NOTFOUND; - } - - MDB_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) { - *countp = 0; - return MDB_NOTFOUND; - } - - *countp = 1; - if (mc->mc_xcursor != NULL) { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - *countp = mc->mc_xcursor->mx_db.md_entries; - } - } -#else - if (unlikely(mc->mc_xcursor == NULL)) - return MDB_INCOMPATIBLE; - - if (!mc->mc_snum) - return MDB_NOTFOUND; - - MDB_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) - return MDB_NOTFOUND; - - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - *countp = 1; - } else { - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) - return EINVAL; - *countp = 
mc->mc_xcursor->mx_db.md_entries; - } -#endif /* MDBX_MODE_ENABLED */ - return MDB_SUCCESS; -} - -void -mdb_cursor_close(MDB_cursor *mc) -{ - if (mc) { - mdb_ensure(NULL, mc->mc_signature == MDBX_MC_SIGNATURE - || mc->mc_signature == MDBX_MC_READY4CLOSE); - if (!mc->mc_backup) { - /* Remove from txn, if tracked. - * A read-only txn (!C_UNTRACK) may have been freed already, - * so do not peek inside it. Only write txns track cursors. */ - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - } - mc->mc_signature = 0; - free(mc); - } else { - /* cursor closed before nested txn ends */ - mdb_cassert(mc, mc->mc_signature == MDBX_MC_SIGNATURE); - mc->mc_signature = MDBX_MC_WAIT4EOT; - } - } -} - -MDB_txn * -mdb_cursor_txn(MDB_cursor *mc) -{ - if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) - return NULL; - return mc->mc_txn; -} - -MDB_dbi -mdb_cursor_dbi(MDB_cursor *mc) -{ - if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) - return INT_MIN; - return mc->mc_dbi; -} - -/** Replace the key for a branch node with a new key. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc Cursor pointing to the node to operate on. - * @param[in] key The new key to use. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_update_key(MDB_cursor *mc, MDB_val *key) -{ - MDB_page *mp; - MDB_node *node; - char *base; - size_t len; - int delta, ksize, oksize; - indx_t ptr, i, numkeys, indx; - DKBUF; - - indx = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node = NODEPTR(mp, indx); - ptr = mp->mp_ptrs[indx]; - { - MDB_val k2; - char kbuf2[DKBUF_MAXKEYSIZE*2+1]; - k2.mv_data = NODEKEY(node); - k2.mv_size = node->mn_ksize; - mdb_debug("update key %u (ofs %u) [%s] to [%s] on page %zu", - indx, ptr, - mdb_dkey(&k2, kbuf2), - DKEY(key), - mp->mp_pgno); - } - - /* Sizes must be 2-byte aligned. */ - ksize = EVEN(key->mv_size); - oksize = EVEN(node->mn_ksize); - delta = ksize - oksize; - - /* Shift node contents if EVEN(key length) changed. */ - if (delta) { - if (delta > 0 && SIZELEFT(mp) < delta) { - pgno_t pgno; - /* not enough space left, do a delete and split */ - mdb_debug("Not enough room, delta = %d, splitting...", delta); - pgno = NODEPGNO(node); - mdb_node_del(mc, 0); - return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); - } - - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] -= delta; - } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); - mp->mp_upper -= delta; - - node = NODEPTR(mp, indx); - } - - /* But even if no shift was needed, update ksize */ - if (node->mn_ksize != key->mv_size) - node->mn_ksize = key->mv_size; - - if (key->mv_size) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - return MDB_SUCCESS; -} - -static void -mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); - -/** Perform \b act while tracking temporary cursor \b mn */ -#define WITH_CURSOR_TRACKING(mn, act) do { \ - MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ - if ((mn).mc_flags & C_SUB) { \ - dummy.mc_flags = C_INITIALIZED; \ - dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ - tracked = &dummy; \ - } else { 
\ - tracked = &(mn); \ - } \ - tracked->mc_next = *tp; \ - *tp = tracked; \ - { act; } \ - *tp = tracked->mc_next; \ -} while (0) - -/** Move a node from csrc to cdst. - */ -static int -mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) -{ - MDB_node *srcnode; - MDB_val key, data; - pgno_t srcpg; - MDB_cursor mn; - int rc; - unsigned short flags; - - DKBUF; - - /* Mark src and dst as dirty. */ - if (unlikely((rc = mdb_page_touch(csrc)) || - (rc = mdb_page_touch(cdst)))) - return rc; - - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); - data.mv_size = 0; - data.mv_data = NULL; - srcpg = 0; - flags = 0; - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); - mdb_cassert(csrc, !((size_t)srcnode & 1)); - srcpg = NODEPGNO(srcnode); - flags = srcnode->mn_flags; - if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned snum = csrc->mc_snum; - MDB_node *s2; - /* must find the lowest key below src */ - rc = mdb_page_search_lowest(csrc); - if (unlikely(rc)) - return rc; - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - csrc->mc_snum = snum--; - csrc->mc_top = snum; - } else { - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - } - mn.mc_xcursor = NULL; - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { - unsigned snum = cdst->mc_snum; - MDB_node *s2; - MDB_val bkey; - /* must find the lowest key below dst */ - mdb_cursor_copy(cdst, &mn); - rc = mdb_page_search_lowest(&mn); - if (unlikely(rc)) - return rc; - if 
(IS_LEAF2(mn.mc_pg[mn.mc_top])) { - bkey.mv_size = mn.mc_db->md_xsize; - bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); - } else { - s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - bkey.mv_size = NODEKSZ(s2); - bkey.mv_data = NODEKEY(s2); - } - mn.mc_snum = snum--; - mn.mc_top = snum; - mn.mc_ki[snum] = 0; - rc = mdb_update_key(&mn, &bkey); - if (unlikely(rc)) - return rc; - } - - mdb_debug("moving %s node %u [%s] on page %zu to node %u on page %zu", - IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", - csrc->mc_ki[csrc->mc_top], - DKEY(&key), - csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno); - - /* Add the node to the destination page. */ - rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - - /* Delete the node from the source page. */ - mdb_node_del(csrc, key.mv_size); - - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mpd, *mps; - - mps = csrc->mc_pg[csrc->mc_top]; - /* If we're adding on the left, bump others up */ - if (fromleft) { - mpd = cdst->mc_pg[csrc->mc_top]; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3 != cdst && - m3->mc_pg[csrc->mc_top] == mpd && - m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { - m3->mc_ki[csrc->mc_top]++; - } - if (m3 !=csrc && - m3->mc_pg[csrc->mc_top] == mps && - m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - m3->mc_ki[csrc->mc_top-1]++; - } - if (XCURSOR_INITED(m3) && IS_LEAF(mps)) - XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - } - } else - /* Adding on the right, bump others down 
*/ - { - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3->mc_pg[csrc->mc_top] == mps) { - if (!m3->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - m3->mc_ki[csrc->mc_top-1]--; - } else { - m3->mc_ki[csrc->mc_top]--; - } - if (XCURSOR_INITED(m3) && IS_LEAF(mps)) - XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - } - } - } - } - - /* Update the parent separators. */ - if (csrc->mc_ki[csrc->mc_top] == 0) { - if (csrc->mc_ki[csrc->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - mdb_debug("update separator for source page %zu to [%s]", - csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); - mdb_cursor_copy(csrc, &mn); - mn.mc_snum--; - mn.mc_top--; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdb_update_key(&mn, &key)); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - MDB_val nullkey; - indx_t ix = csrc->mc_ki[csrc->mc_top]; - nullkey.mv_size = 0; - csrc->mc_ki[csrc->mc_top] = 0; - rc = mdb_update_key(csrc, &nullkey); - csrc->mc_ki[csrc->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - if (cdst->mc_ki[cdst->mc_top] == 0) { - if (cdst->mc_ki[cdst->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - 
mdb_debug("update separator for destination page %zu to [%s]", - cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); - mdb_cursor_copy(cdst, &mn); - mn.mc_snum--; - mn.mc_top--; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdb_update_key(&mn, &key)); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { - MDB_val nullkey; - indx_t ix = cdst->mc_ki[cdst->mc_top]; - nullkey.mv_size = 0; - cdst->mc_ki[cdst->mc_top] = 0; - rc = mdb_update_key(cdst, &nullkey); - cdst->mc_ki[cdst->mc_top] = ix; - mdb_cassert(cdst, rc == MDB_SUCCESS); - } - } - - return MDB_SUCCESS; -} - -/** Merge one page into another. - * The nodes from the page pointed to by \b csrc will - * be copied to the page pointed to by \b cdst and then - * the \b csrc page will be freed. - * @param[in] csrc Cursor pointing to the source page. - * @param[in] cdst Cursor pointing to the destination page. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) -{ - MDB_page *psrc, *pdst; - MDB_node *srcnode; - MDB_val key, data; - unsigned nkeys; - int rc; - indx_t i, j; - - psrc = csrc->mc_pg[csrc->mc_top]; - pdst = cdst->mc_pg[cdst->mc_top]; - - mdb_debug("merging page %zu into %zu", psrc->mp_pgno, pdst->mp_pgno); - - mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ - mdb_cassert(csrc, cdst->mc_snum > 1); - - /* Mark dst as dirty. */ - if (unlikely(rc = mdb_page_touch(cdst))) - return rc; - - /* get dst page again now that we've touched it. */ - pdst = cdst->mc_pg[cdst->mc_top]; - - /* Move all nodes from src to dst. 
- */ - j = nkeys = NUMKEYS(pdst); - if (IS_LEAF2(psrc)) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = PAGEDATA(psrc); - for (i = 0; i < NUMKEYS(psrc); i++, j++) { - rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - key.mv_data = (char *)key.mv_data + key.mv_size; - } - } else { - for (i = 0; i < NUMKEYS(psrc); i++, j++) { - srcnode = NODEPTR(psrc, i); - if (i == 0 && IS_BRANCH(psrc)) { - MDB_cursor mn; - MDB_node *s2; - mdb_cursor_copy(csrc, &mn); - mn.mc_xcursor = NULL; - /* must find the lowest key below src */ - rc = mdb_page_search_lowest(&mn); - if (unlikely(rc)) - return rc; - if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { - key.mv_size = mn.mc_db->md_xsize; - key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - } else { - key.mv_size = srcnode->mn_ksize; - key.mv_data = NODEKEY(srcnode); - } - - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } - - mdb_debug("dst page %zu now has %u keys (%.1f%% filled)", - pdst->mp_pgno, NUMKEYS(pdst), - (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); - - /* Unlink the src page from parent and add to free list. - */ - csrc->mc_top--; - mdb_node_del(csrc, 0); - if (csrc->mc_ki[csrc->mc_top] == 0) { - key.mv_size = 0; - rc = mdb_update_key(csrc, &key); - if (unlikely(rc)) { - csrc->mc_top++; - return rc; - } - } - csrc->mc_top++; - - psrc = csrc->mc_pg[csrc->mc_top]; - /* If not operating on FreeDB, allow this page to be reused - * in this txn. Otherwise just add to free list. 
- */ - rc = mdb_page_loose(csrc, psrc); - if (unlikely(rc)) - return rc; - if (IS_LEAF(psrc)) - csrc->mc_db->md_leaf_pages--; - else - csrc->mc_db->md_branch_pages--; - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - unsigned top = csrc->mc_top; - - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[top] == psrc) { - m3->mc_pg[top] = pdst; - m3->mc_ki[top] += nkeys; - m3->mc_ki[top-1] = cdst->mc_ki[top-1]; - } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && - m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { - m3->mc_ki[top-1]--; - } - if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) - XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); - } - } - { - unsigned snum = cdst->mc_snum; - uint16_t depth = cdst->mc_db->md_depth; - mdb_cursor_pop(cdst); - rc = mdb_rebalance(cdst); - /* Did the tree height change? */ - if (depth != cdst->mc_db->md_depth) - snum += cdst->mc_db->md_depth - depth; - cdst->mc_snum = snum; - cdst->mc_top = snum-1; - } - return rc; -} - -/** Copy the contents of a cursor. - * @param[in] csrc The cursor to copy from. - * @param[out] cdst The cursor to copy to. - */ -static void -mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) -{ - unsigned i; - - cdst->mc_txn = csrc->mc_txn; - cdst->mc_dbi = csrc->mc_dbi; - cdst->mc_db = csrc->mc_db; - cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_snum = csrc->mc_snum; - cdst->mc_top = csrc->mc_top; - cdst->mc_flags = csrc->mc_flags; - - for (i=0; imc_snum; i++) { - cdst->mc_pg[i] = csrc->mc_pg[i]; - cdst->mc_ki[i] = csrc->mc_ki[i]; - } -} - -/** Rebalance the tree after a delete operation. - * @param[in] mc Cursor pointing to the page where rebalancing - * should begin. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_rebalance(MDB_cursor *mc) -{ - MDB_node *node; - int rc, fromleft; - unsigned ptop, minkeys, thresh; - MDB_cursor mn; - indx_t oldki; - - if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { - minkeys = 2; - thresh = 1; - } else { - minkeys = 1; - thresh = FILL_THRESHOLD; - } - mdb_debug("rebalancing %s page %zu (has %u keys, %.1f%% full)", - IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10); - - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && - NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - mdb_debug("no need to rebalance page %zu, above fill threshold", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top])); - return MDB_SUCCESS; - } - - if (mc->mc_snum < 2) { - MDB_page *mp = mc->mc_pg[0]; - if (IS_SUBP(mp)) { - mdb_debug("Can't rebalance a subpage, ignoring"); - return MDB_SUCCESS; - } - if (NUMKEYS(mp) == 0) { - mdb_debug("tree is completely empty"); - mc->mc_db->md_root = P_INVALID; - mc->mc_db->md_depth = 0; - mc->mc_db->md_leaf_pages = 0; - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (unlikely(rc)) - return rc; - /* Adjust cursors pointing to mp */ - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) - continue; - if (m3->mc_pg[0] == mp) { - m3->mc_snum = 0; - m3->mc_top = 0; - m3->mc_flags &= ~C_INITIALIZED; - } - } - } - } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { - int i; - mdb_debug("collapsing root page!"); - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (unlikely(rc)) - return rc; - mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc, mc->mc_db->md_root, 
&mc->mc_pg[0], NULL); - if (unlikely(rc)) - return rc; - mc->mc_db->md_depth--; - mc->mc_db->md_branch_pages--; - mc->mc_ki[0] = mc->mc_ki[1]; - for (i = 1; imc_db->md_depth; i++) { - mc->mc_pg[i] = mc->mc_pg[i+1]; - mc->mc_ki[i] = mc->mc_ki[i+1]; - } - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc) continue; - if (!(m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_pg[0] == mp) { - for (i=0; imc_db->md_depth; i++) { - m3->mc_pg[i] = m3->mc_pg[i+1]; - m3->mc_ki[i] = m3->mc_ki[i+1]; - } - m3->mc_snum--; - m3->mc_top--; - } - } - } - } else - mdb_debug("root page doesn't need rebalancing"); - return MDB_SUCCESS; - } - - /* The parent (branch page) must have at least 2 pointers, - * otherwise the tree is invalid. - */ - ptop = mc->mc_top-1; - mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); - - /* Leaf page fill factor is below the threshold. - * Try to move keys from left or right neighbor, or - * merge with a neighbor page. - */ - - /* Find neighbors. - */ - mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; - - oldki = mc->mc_ki[mc->mc_top]; - if (mc->mc_ki[ptop] == 0) { - /* We're the leftmost leaf in our parent. - */ - mdb_debug("reading right neighbor"); - mn.mc_ki[ptop]++; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); - if (unlikely(rc)) - return rc; - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - fromleft = 0; - } else { - /* There is at least one neighbor to the left. 
- */ - mdb_debug("reading left neighbor"); - mn.mc_ki[ptop]--; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); - if (unlikely(rc)) - return rc; - mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; - mc->mc_ki[mc->mc_top] = 0; - fromleft = 1; - } - - mdb_debug("found neighbor page %zu (%u keys, %.1f%% full)", - mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); - - /* If the neighbor page is above threshold and has enough keys, - * move one key from it. Otherwise we should try to merge them. - * (A branch page must never have less than 2 keys.) - */ - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { - rc = mdb_node_move(&mn, mc, fromleft); - if (fromleft) { - /* if we inserted on left, bump position up */ - oldki++; - } - } else { - if (!fromleft) { - rc = mdb_page_merge(&mn, mc); - } else { - oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); - mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, - rc = mdb_page_merge(mc, &mn)); - mdb_cursor_copy(&mn, mc); - } - mc->mc_flags &= ~C_EOF; - } - mc->mc_ki[mc->mc_top] = oldki; - return rc; -} - -/** Complete a delete operation started by #mdb_cursor_del(). */ -static int -mdb_cursor_del0(MDB_cursor *mc) -{ - int rc; - MDB_page *mp; - indx_t ki; - unsigned nkeys; - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - mdb_node_del(mc, mc->mc_db->md_xsize); - mc->mc_db->md_entries--; - { - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! 
(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3 == mc || m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } - } - rc = mdb_rebalance(mc); - - if (likely(rc == MDB_SUCCESS)) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by mdb_rebalance and aren't needed here. - */ - if (!mc->mc_snum) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - nkeys = NUMKEYS(mp); - - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; - continue; - } - } - if (mc->mc_db->md_flags & MDB_DUPSORT) { - MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node is a fake page, it needs to be reinited - * because its data has moved. But just reset mc_pg[0] - * if the xcursor is already live. 
- */ - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - else - mdb_xcursor_init1(m3, node); - } - } - } - } - } - mc->mc_flags |= C_DEL; - } - - if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_del(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data) -{ - if (unlikely(!key || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - -#if ! MDBX_MODE_ENABLED - if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { - /* must ignore any data */ - data = NULL; - } -#endif - - return mdb_del0(txn, dbi, key, data, 0); -} - -static int -mdb_del0(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, unsigned flags) -{ - MDB_cursor mc; - MDB_xcursor mx; - MDB_cursor_op op; - MDB_val rdata; - int rc, exact = 0; - DKBUF; - - mdb_debug("====> delete db %u key [%s]", dbi, DKEY(key)); - - mdb_cursor_init(&mc, txn, dbi, &mx); - - if (data) { - op = MDB_GET_BOTH; - rdata = *data; - data = &rdata; - } else { - op = MDB_SET; - flags |= MDB_NODUPDATA; - } - rc = mdb_cursor_set(&mc, key, data, op, &exact); - if (likely(rc == 0)) { - /* let mdb_page_split know about this cursor if needed: - * delete will trigger a rebalance; if it needs to move - * a node from one page to another, it will have to - * update the parent's separator key(s). If the new sepkey - * is larger than the current one, the parent page may - * run out of space, triggering a split. We need this - * cursor to be consistent until the end of the rebalance. 
- */ - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdb_cursor_del(&mc, flags); - txn->mt_cursors[dbi] = mc.mc_next; - } - return rc; -} - -/** Split a page and insert a new node. - * Set #MDB_TXN_ERROR on failure. - * @param[in,out] mc Cursor pointing to the page and desired insertion index. - * The cursor will be updated to point to the actual page and index where - * the node got inserted after the split. - * @param[in] newkey The key for the newly inserted node. - * @param[in] newdata The data for the newly inserted node. - * @param[in] newpgno The page number, if the new node is a branch node. - * @param[in] nflags The #NODE_ADD_FLAGS for the new node. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, - unsigned nflags) -{ - unsigned flags; - int rc = MDB_SUCCESS, new_root = 0, did_split = 0; - indx_t newindx; - pgno_t pgno = 0; - int i, j, split_indx, nkeys, pmax; - MDB_env *env = mc->mc_txn->mt_env; - MDB_node *node; - MDB_val sepkey, rkey, xdata, *rdata = &xdata; - MDB_page *copy = NULL; - MDB_page *mp, *rp, *pp; - int ptop; - MDB_cursor mn; - DKBUF; - - mp = mc->mc_pg[mc->mc_top]; - newindx = mc->mc_ki[mc->mc_top]; - nkeys = NUMKEYS(mp); - - mdb_debug("-----> splitting %s page %zu and adding [%s] at index %i/%i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, - DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys); - - /* Create a right sibling. */ - if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) - return rc; - rp->mp_leaf2_ksize = mp->mp_leaf2_ksize; - mdb_debug("new right sibling: page %zu", rp->mp_pgno); - - /* Usually when splitting the root page, the cursor - * height is 1. But when called from mdb_update_key, - * the cursor height may be greater because it walks - * up the stack while finding the branch slot to update. 
- */ - if (mc->mc_top < 1) { - if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) - goto done; - /* shift current top to make room for new parent */ - for (i=mc->mc_snum; i>0; i--) { - mc->mc_pg[i] = mc->mc_pg[i-1]; - mc->mc_ki[i] = mc->mc_ki[i-1]; - } - mc->mc_pg[0] = pp; - mc->mc_ki[0] = 0; - mc->mc_db->md_root = pp->mp_pgno; - mdb_debug("root split! new root = %zu", pp->mp_pgno); - new_root = mc->mc_db->md_depth++; - - /* Add left (implicit) pointer. */ - if (unlikely((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS)) { - /* undo the pre-push */ - mc->mc_pg[0] = mc->mc_pg[1]; - mc->mc_ki[0] = mc->mc_ki[1]; - mc->mc_db->md_root = mp->mp_pgno; - mc->mc_db->md_depth--; - goto done; - } - mc->mc_snum++; - mc->mc_top++; - ptop = 0; - } else { - ptop = mc->mc_top-1; - mdb_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno); - } - - mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; - mn.mc_pg[mn.mc_top] = rp; - mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; - - if (nflags & MDB_APPEND) { - mn.mc_ki[mn.mc_top] = 0; - sepkey = *newkey; - split_indx = newindx; - nkeys = 0; - } else { - split_indx = (nkeys+1) / 2; - - if (IS_LEAF2(rp)) { - char *split, *ins; - int x; - unsigned lsize, rsize, ksize; - /* Move half of the keys to the right sibling */ - x = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_xsize; - split = LEAF2KEY(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); - mp->mp_lower -= lsize; - rp->mp_lower += lsize; - mp->mp_upper += rsize - lsize; - rp->mp_upper -= rsize - lsize; - sepkey.mv_size = ksize; - if (newindx == split_indx) { - sepkey.mv_data = newkey->mv_data; - } else { - sepkey.mv_data = split; - } - if (x<0) { - ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); - memcpy(rp->mp_ptrs, split, rsize); - sepkey.mv_data = rp->mp_ptrs; - memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->mv_data, ksize); - mp->mp_lower += 
sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - } else { - if (x) - memcpy(rp->mp_ptrs, split, x * ksize); - ins = LEAF2KEY(rp, x, ksize); - memcpy(ins, newkey->mv_data, ksize); - memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); - rp->mp_lower += sizeof(indx_t); - rp->mp_upper -= ksize - sizeof(indx_t); - mc->mc_ki[mc->mc_top] = x; - } - } else { - int psize, nsize, k; - /* Maximum free space in an empty page */ - pmax = env->me_psize - PAGEHDRSZ; - if (IS_LEAF(mp)) - nsize = mdb_leaf_size(env, newkey, newdata); - else - nsize = mdb_branch_size(env, newkey); - nsize = EVEN(nsize); - - /* grab a page to hold a temporary copy */ - copy = mdb_page_malloc(mc->mc_txn, 1); - if (unlikely(copy == NULL)) { - rc = ENOMEM; - goto done; - } - copy->mp_pgno = mp->mp_pgno; - copy->mp_flags = mp->mp_flags; - copy->mp_lower = (PAGEHDRSZ-PAGEBASE); - copy->mp_upper = env->me_psize - PAGEBASE; - - /* prepare to insert */ - for (i=0, j=0; imp_ptrs[j++] = 0; - } - copy->mp_ptrs[j++] = mp->mp_ptrs[i]; - } - - /* When items are relatively large the split point needs - * to be checked, because being off-by-one will make the - * difference between success or failure in mdb_node_add. - * - * It's also relevant if a page happens to be laid out - * such that one half of its nodes are all "small" and - * the other half of its nodes are "large." If the new - * item is also "large" and falls on the half with - * "large" nodes, it also may not fit. - * - * As a final tweak, if the new item goes on the last - * spot on the page (and thus, onto the new page), bias - * the split so the new page is emptier than the old page. - * This yields better packing during sequential inserts. - */ - if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { - /* Find split point */ - psize = 0; - if (newindx <= split_indx || newindx >= nkeys) { - i = 0; j = 1; - k = newindx >= nkeys ? 
nkeys : split_indx+1+IS_LEAF(mp); - } else { - i = nkeys; j = -1; - k = split_indx-1; - } - for (; i!=k; i+=j) { - if (i == newindx) { - psize += nsize; - node = NULL; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); - psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - } - psize = EVEN(psize); - } - if (psize > pmax || i == k-j) { - split_indx = i + (j<0); - break; - } - } - } - if (split_indx == newindx) { - sepkey.mv_size = newkey->mv_size; - sepkey.mv_data = newkey->mv_data; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); - sepkey.mv_size = node->mn_ksize; - sepkey.mv_data = NODEKEY(node); - } - } - } - - mdb_debug("separator is %d [%s]", split_indx, DKEY(&sepkey)); - - /* Copy separator key to the parent. */ - if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { - int snum = mc->mc_snum; - mn.mc_snum--; - mn.mc_top--; - did_split = 1; - /* We want other splits to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); - if (unlikely(rc != MDB_SUCCESS)) - goto done; - - /* root split? */ - if (mc->mc_snum > snum) { - ptop++; - } - /* Right page might now have changed parent. - * Check if left page also changed parent. 
- */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; imc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - if (mn.mc_ki[ptop]) { - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; - } else { - /* find right page's left sibling */ - mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = mdb_cursor_sibling(mc, 0); - } - } - } else { - mn.mc_top--; - rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); - mn.mc_top++; - } - if (unlikely(rc != MDB_SUCCESS)) { - if (rc == MDB_NOTFOUND) /* improper mdb_cursor_sibling() result */ - rc = MDB_PROBLEM; - goto done; - } - if (nflags & MDB_APPEND) { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[mc->mc_top] = 0; - rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); - if (rc) - goto done; - for (i=0; imc_top; i++) - mc->mc_ki[i] = mn.mc_ki[i]; - } else if (!IS_LEAF2(mp)) { - /* Move nodes */ - mc->mc_pg[mc->mc_top] = rp; - i = split_indx; - j = 0; - do { - if (i == newindx) { - rkey.mv_data = newkey->mv_data; - rkey.mv_size = newkey->mv_size; - if (IS_LEAF(mp)) { - rdata = newdata; - } else - pgno = newpgno; - flags = nflags; - /* Update index for the new key. */ - mc->mc_ki[mc->mc_top] = j; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); - rkey.mv_data = NODEKEY(node); - rkey.mv_size = node->mn_ksize; - if (IS_LEAF(mp)) { - xdata.mv_data = NODEDATA(node); - xdata.mv_size = NODEDSZ(node); - rdata = &xdata; - } else - pgno = NODEPGNO(node); - flags = node->mn_flags; - } - - if (!IS_LEAF(mp) && j == 0) { - /* First branch index doesn't need key data. 
*/ - rkey.mv_size = 0; - } - - rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); - if (rc) - goto done; - if (i == nkeys) { - i = 0; - j = 0; - mc->mc_pg[mc->mc_top] = copy; - } else { - i++; - j++; - } - } while (i != split_indx); - - nkeys = NUMKEYS(copy); - for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; - mp->mp_lower = copy->mp_lower; - mp->mp_upper = copy->mp_upper; - memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), - env->me_psize - copy->mp_upper - PAGEBASE); - - /* reset back to original page */ - if (newindx < split_indx) { - mc->mc_pg[mc->mc_top] = mp; - } else { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. - */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; i<=ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - } - } - if (nflags & MDB_RESERVE) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } - } else { - if (newindx >= split_indx) { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. 
- */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; i<=ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - } - } - } - - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - nkeys = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc) - continue; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (new_root) { - int k; - /* sub cursors may be on different DB */ - if (m3->mc_pg[0] != mp) - continue; - /* root split */ - for (k=new_root; k>=0; k--) { - m3->mc_ki[k+1] = m3->mc_ki[k]; - m3->mc_pg[k+1] = m3->mc_pg[k]; - } - if (m3->mc_ki[0] >= nkeys) { - m3->mc_ki[0] = 1; - } else { - m3->mc_ki[0] = 0; - } - m3->mc_pg[0] = mc->mc_pg[0]; - m3->mc_snum++; - m3->mc_top++; - } - if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) - m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= nkeys) { - m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= nkeys; - for (i=0; imc_top; i++) { - m3->mc_ki[i] = mn.mc_ki[i]; - m3->mc_pg[i] = mn.mc_pg[i]; - } - } - } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && - m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { - m3->mc_ki[ptop]++; - } - if (XCURSOR_INITED(m3) && IS_LEAF(mp)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } - mdb_debug("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)); - -done: - if (copy) /* tmp page */ - mdb_page_free(env, copy); - if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_put(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, unsigned flags) -{ - MDB_cursor mc; - MDB_xcursor mx; - - if (unlikely(!key || !data || !txn)) - return EINVAL; - - if 
(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP - /* LY: MDB_CURRENT indicates explicit overwrite (update) for MDBX */ - | (MDBX_MODE_ENABLED ? MDB_CURRENT : 0)))) - return EINVAL; - - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - mdb_cursor_init(&mc, txn, dbi, &mx); - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - int rc = MDB_SUCCESS; -#if MDBX_MODE_ENABLED - /* LY: support for update (explicit overwrite) */ - if (flags & MDB_CURRENT) { - rc = mdb_cursor_get(&mc, key, NULL, MDB_SET); - if (likely(rc == MDB_SUCCESS) && (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)) { - /* LY: allows update (explicit overwrite) only for unique keys */ - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_tassert(txn, XCURSOR_INITED(&mc) && mc.mc_xcursor->mx_db.md_entries > 1); - rc = MDBX_EMULTIVAL; - } - } - } -#endif /* MDBX_MODE_ENABLED */ - if (likely(rc == MDB_SUCCESS)) - rc = mdb_cursor_put(&mc, key, data, flags); - txn->mt_cursors[dbi] = mc.mc_next; - - return rc; -} - -#ifndef MDB_WBUF -# define MDB_WBUF (1024*1024) -#endif -#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ - - /** State needed for a double-buffering compacting copy. */ -typedef struct mdb_copy { - MDB_env *mc_env; - MDB_txn *mc_txn; - pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ - char *mc_wbuf[2]; - char *mc_over[2]; - int mc_wlen[2]; - int mc_olen[2]; - pgno_t mc_next_pgno; - HANDLE mc_fd; - int mc_toggle; /**< Buffer number in provider */ - int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ - /** Error code. Never cleared if set. 
Both threads can set nonzero - * to fail the copy. Not mutex-protected, LMDB expects atomic int. - */ - volatile int mc_error; -} mdb_copy; - - /** Dedicated writer thread for compacting copy. */ -static void* __cold -mdb_env_copythr(void *arg) -{ - mdb_copy *my = arg; - char *ptr; - int toggle = 0, wsize, rc = 0; - int len; - -#ifdef SIGPIPE - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGPIPE); - if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) - my->mc_error = rc; -#endif - - pthread_mutex_lock(&my->mc_mutex); - for(;;) { - while (!my->mc_new) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ - break; - wsize = my->mc_wlen[toggle]; - ptr = my->mc_wbuf[toggle]; -again: - rc = MDB_SUCCESS; - while (wsize > 0 && !my->mc_error) { - len = write(my->mc_fd, ptr, wsize); - if (len < 0) { - rc = errno; -#ifdef SIGPIPE - if (rc == EPIPE) { - /* Collect the pending SIGPIPE, otherwise at least OS X - * gives it to the process on thread-exit (ITS#8504). - */ - int tmp; - sigwait(&set, &tmp); - } -#endif - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - if (rc) { - my->mc_error = rc; - } - /* If there's an overflow page tail, write it too */ - if (my->mc_olen[toggle]) { - wsize = my->mc_olen[toggle]; - ptr = my->mc_over[toggle]; - my->mc_olen[toggle] = 0; - goto again; - } - my->mc_wlen[toggle] = 0; - toggle ^= 1; - /* Return the empty buffer to provider */ - my->mc_new--; - pthread_cond_signal(&my->mc_cond); - } - pthread_mutex_unlock(&my->mc_mutex); - return NULL; -} - - /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. - * - * @param[in] my control structure. - * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 
- */ -static int __cold -mdb_env_cthr_toggle(mdb_copy *my, int adjust) -{ - pthread_mutex_lock(&my->mc_mutex); - my->mc_new += adjust; - pthread_cond_signal(&my->mc_cond); - while (my->mc_new & 2) /* both buffers in use */ - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - pthread_mutex_unlock(&my->mc_mutex); - - my->mc_toggle ^= (adjust & 1); - /* Both threads reset mc_wlen, to be safe from threading errors */ - my->mc_wlen[my->mc_toggle] = 0; - return my->mc_error; -} - - /** Depth-first tree traversal for compacting copy. - * @param[in] my control structure. - * @param[in,out] pg database root. - * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. - */ -static int __cold -mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) -{ - MDB_cursor mc; - MDB_node *ni; - MDB_page *mo, *mp, *leaf; - char *buf, *ptr; - int rc, toggle; - unsigned i; - - /* Empty DB, nothing to do */ - if (*pg == P_INVALID) - return MDB_SUCCESS; - - memset(&mc, 0, sizeof(mc)); - mc.mc_snum = 1; - mc.mc_txn = my->mc_txn; - - rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); - if (rc) - return rc; - rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); - if (rc) - return rc; - - /* Make cursor pages writable */ - buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); - if (buf == NULL) - return ENOMEM; - - for (i=0; imc_env->me_psize); - mc.mc_pg[i] = (MDB_page *)ptr; - ptr += my->mc_env->me_psize; - } - - /* This is writable space for a leaf page. Usually not needed. 
*/ - leaf = (MDB_page *)ptr; - - toggle = my->mc_toggle; - while (mc.mc_snum > 0) { - unsigned n; - mp = mc.mc_pg[mc.mc_top]; - n = NUMKEYS(mp); - - if (IS_LEAF(mp)) { - if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - - /* Need writable leaf */ - if (mp != leaf) { - mc.mc_pg[mc.mc_top] = leaf; - mdb_page_copy(leaf, mp, my->mc_env->me_psize); - mp = leaf; - ni = NODEPTR(mp, i); - } - - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); - rc = mdb_page_get(&mc, pg, &omp, NULL); - if (rc) - goto done; - if (my->mc_wlen[toggle] >= MDB_WBUF) { - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - memcpy(mo, omp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno; - my->mc_next_pgno += omp->mp_pages; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (omp->mp_pages > 1) { - my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); - my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - } else if (ni->mn_flags & F_SUBDATA) { - MDB_db db; - - /* Need writable leaf */ - if (mp != leaf) { - mc.mc_pg[mc.mc_top] = leaf; - mdb_page_copy(leaf, mp, my->mc_env->me_psize); - mp = leaf; - ni = NODEPTR(mp, i); - } - - memcpy(&db, NODEDATA(ni), sizeof(db)); - my->mc_toggle = toggle; - rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); - if (rc) - goto done; - toggle = my->mc_toggle; - memcpy(NODEDATA(ni), &db, sizeof(db)); - } - } - } - } else { - mc.mc_ki[mc.mc_top]++; - if (mc.mc_ki[mc.mc_top] < n) { - pgno_t pg; -again: - ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); - pg = NODEPGNO(ni); - rc = mdb_page_get(&mc, pg, &mp, NULL); - if (rc) - goto done; - mc.mc_top++; - mc.mc_snum++; - mc.mc_ki[mc.mc_top] = 0; - if (IS_BRANCH(mp)) { - /* Whenever we advance to a 
sibling branch page, - * we must proceed all the way down to its first leaf. - */ - mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); - goto again; - } else - mc.mc_pg[mc.mc_top] = mp; - continue; - } - } - if (my->mc_wlen[toggle] >= MDB_WBUF) { - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - mdb_page_copy(mo, mp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno++; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (mc.mc_top) { - /* Update parent if there is one */ - ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); - SETPGNO(ni, mo->mp_pgno); - mdb_cursor_pop(&mc); - } else { - /* Otherwise we're done */ - *pg = mo->mp_pgno; - break; - } - } -done: - free(buf); - return rc; -} - - /** Copy environment with compaction. */ -static int __cold -mdb_env_copyfd1(MDB_env *env, HANDLE fd) -{ - MDB_meta *mm; - MDB_page *mp; - mdb_copy my; - MDB_txn *txn = NULL; - pthread_t thr; - pgno_t root, new_root; - int rc = MDB_SUCCESS; - - memset(&my, 0, sizeof(my)); - if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) - return rc; - if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) - goto done2; - my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); - if (my.mc_wbuf[0] == NULL) { - rc = errno; - goto done; - } - memset(my.mc_wbuf[0], 0, MDB_WBUF*2); - my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; - my.mc_next_pgno = NUM_METAS; - my.mc_env = env; - my.mc_fd = fd; - rc = pthread_create(&thr, NULL, mdb_env_copythr, &my); - if (rc) - goto done; - - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - goto finish; - - mp = (MDB_page *)my.mc_wbuf[0]; - memset(mp, 0, NUM_METAS * env->me_psize); - mp->mp_pgno = 0; - mp->mp_flags = P_META; - mm = (MDB_meta *)PAGEDATA(mp); - mdb_env_init_meta0(env, mm); - mm->mm_address = METAPAGE_1(env)->mm_address; - - mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); - mp->mp_pgno = 1; - mp->mp_flags = 
P_META; - *(MDB_meta *)PAGEDATA(mp) = *mm; - mm = (MDB_meta *)PAGEDATA(mp); - - /* Set metapage 1 with current main DB */ - root = new_root = txn->mt_dbs[MAIN_DBI].md_root; - if (root != P_INVALID) { - /* Count free pages + freeDB pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. - */ - MDB_ID freecount = 0; - MDB_cursor mc; - MDB_val key, data; - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; - if (rc != MDB_NOTFOUND) - goto finish; - freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + - txn->mt_dbs[FREE_DBI].md_leaf_pages + - txn->mt_dbs[FREE_DBI].md_overflow_pages; - - new_root = txn->mt_next_pgno - 1 - freecount; - mm->mm_last_pg = new_root; - mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - mm->mm_dbs[MAIN_DBI].md_root = new_root; - } else { - /* When the DB is empty, handle it specially to - * fix any breakage like page leaks from ITS#8174. - */ - mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; - } - if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { - mm->mm_txnid = 1; /* use metapage 1 */ - } - - my.mc_wlen[0] = env->me_psize * NUM_METAS; - my.mc_txn = txn; - rc = mdb_env_cwalk(&my, &root, 0); - if (rc == MDB_SUCCESS && root != new_root) { - rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ - } - -finish: - if (rc) - my.mc_error = rc; - mdb_env_cthr_toggle(&my, 1 | MDB_EOF); - rc = pthread_join(thr, NULL); - mdb_txn_abort(txn); - -done: - free(my.mc_wbuf[0]); - pthread_cond_destroy(&my.mc_cond); -done2: - pthread_mutex_destroy(&my.mc_mutex); - return rc ? rc : my.mc_error; -} - - /** Copy environment as-is. */ -static int __cold -mdb_env_copyfd0(MDB_env *env, HANDLE fd) -{ - MDB_txn *txn = NULL; - pthread_mutex_t *wmutex = NULL; - int rc; - size_t wsize; - char *ptr; - ssize_t len; - size_t w2; - - /* Do the lock/unlock of the reader mutex before starting the - * write txn. 
Otherwise other read txns could block writers. - */ - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - return rc; - - /* We must start the actual read txn after blocking writers */ - rc = mdb_txn_end(txn, MDB_END_RESET_TMP); - if (rc) - return rc; - - /* Temporarily block writers until we snapshot the meta pages */ - wmutex = MDB_MUTEX(env, w); - rc = mdb_mutex_lock(env, wmutex); - if (unlikely(rc)) - goto leave; - - rc = mdb_txn_renew0(txn, MDB_RDONLY); - if (rc) { - mdb_mutex_unlock(env, wmutex); - goto leave; - } - - wsize = env->me_psize * NUM_METAS; - ptr = env->me_map; - w2 = wsize; - while (w2 > 0) { - len = write(fd, ptr, w2); - if (len < 0) { - rc = errno; - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - w2 -= len; - continue; - } else { - /* Non-blocking or async handles are not supported */ - rc = EIO; - break; - } - } - mdb_mutex_unlock(env, wmutex); - - if (rc) - goto leave; - - w2 = txn->mt_next_pgno * env->me_psize; - { - size_t fsize = 0; - if ((rc = mdb_fsize(env->me_fd, &fsize))) - goto leave; - if (w2 > fsize) - w2 = fsize; - } - wsize = w2 - wsize; - while (wsize > 0) { - if (wsize > MAX_WRITE) - w2 = MAX_WRITE; - else - w2 = wsize; - len = write(fd, ptr, w2); - if (len < 0 ) { - rc = errno; - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - -leave: - mdb_txn_abort(txn); - return rc; -} - -int __cold -mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned flags) -{ - if (flags & MDB_CP_COMPACT) - return mdb_env_copyfd1(env, fd); - else - return mdb_env_copyfd0(env, fd); -} - -int __cold -mdb_env_copyfd(MDB_env *env, HANDLE fd) -{ - return mdb_env_copyfd2(env, fd, 0); -} - -int __cold -mdb_env_copy2(MDB_env *env, const char *path, unsigned flags) -{ - int rc, len; - char *lpath; - HANDLE newfd = INVALID_HANDLE_VALUE; - - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += 
sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } - - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. - */ - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666); - if (newfd == INVALID_HANDLE_VALUE) { - rc = errno; - goto leave; - } - - int fdflags; - if ((fdflags = fcntl(newfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(newfd, F_SETFD, fdflags); - - if (env->me_psize >= env->me_os_psize) { -#ifdef F_NOCACHE /* __APPLE__ */ - (void) fcntl(newfd, F_NOCACHE, 1); -#elif defined O_DIRECT - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); -#endif - } - - rc = mdb_env_copyfd2(env, newfd, flags); - -leave: - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd != INVALID_HANDLE_VALUE) - if (close(newfd) < 0 && rc == MDB_SUCCESS) - rc = errno; - - return rc; -} - -int __cold -mdb_env_copy(MDB_env *env, const char *path) -{ - return mdb_env_copy2(env, path, 0); -} - -int __cold -mdb_env_set_flags(MDB_env *env, unsigned flags, int onoff) -{ - if (unlikely(flags & ~CHANGEABLE)) - return EINVAL; - - pthread_mutex_t *mutex = MDB_MUTEX(env, w); - int rc = mdb_mutex_lock(env, mutex); - if (unlikely(rc)) - return rc; - - if (onoff) - env->me_flags |= flags; - else - env->me_flags &= ~flags; - - mdb_mutex_unlock(env, mutex); - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_flags(MDB_env *env, unsigned *arg) -{ - if (unlikely(!env || !arg)) - return EINVAL; - - *arg = env->me_flags & (CHANGEABLE|CHANGELESS); - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_userctx(MDB_env *env, void *ctx) -{ - if (unlikely(!env)) - return EINVAL; - env->me_userctx = ctx; - return MDB_SUCCESS; -} - -void * __cold -mdb_env_get_userctx(MDB_env *env) -{ - return env ? 
env->me_userctx : NULL; -} - -int __cold -mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) -{ - if (unlikely(!env)) - return EINVAL; -#if MDB_DEBUG - env->me_assert_func = func; - return MDB_SUCCESS; -#else - (void) func; - return ENOSYS; -#endif -} - -int __cold -mdb_env_get_path(MDB_env *env, const char **arg) -{ - if (unlikely(!env || !arg)) - return EINVAL; - - *arg = env->me_path; - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) -{ - if (unlikely(!env || !arg)) - return EINVAL; - - *arg = env->me_fd; - return MDB_SUCCESS; -} - -/** Common code for #mdb_stat() and #mdb_env_stat(). - * @param[in] env the environment to operate in. - * @param[in] db the #MDB_db record containing the stats to return. - * @param[out] arg the address of an #MDB_stat structure to receive the stats. - * @return 0, this function always succeeds. - */ -static int __cold -mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) -{ - arg->ms_psize = env->me_psize; - arg->ms_depth = db->md_depth; - arg->ms_branch_pages = db->md_branch_pages; - arg->ms_leaf_pages = db->md_leaf_pages; - arg->ms_overflow_pages = db->md_overflow_pages; - arg->ms_entries = db->md_entries; - - return MDB_SUCCESS; -} - -MDBX_ONLY_FEATURE int __cold -mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) -{ - MDB_meta *meta; - - if (unlikely(env == NULL || arg == NULL)) - return EINVAL; - if (unlikely(bytes != sizeof(MDBX_stat))) - return EINVAL; - - meta = mdb_meta_head_r(env); - return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], &arg->base); -} - -int __cold -mdb_env_stat(MDB_env *env, MDB_stat *arg) -{ - return mdbx_env_stat(env, (MDBX_stat *) arg, sizeof(MDB_stat)); -} - -MDBX_ONLY_FEATURE int __cold -mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) -{ - MDB_meta *meta; - - if (unlikely(env == NULL || arg == NULL)) - return EINVAL; - - if (bytes == sizeof(MDB_envinfo)) { - do { - meta = mdb_meta_head_r(env); - arg->base.me_last_txnid = meta->mm_txnid; - 
arg->base.me_last_pgno = meta->mm_last_pg; - arg->base.me_mapaddr = meta->mm_address; - arg->base.me_mapsize = env->me_mapsize; - arg->base.me_maxreaders = env->me_maxreaders; - arg->base.me_numreaders = env->me_txns->mti_numreaders; - } while (unlikely( arg->base.me_last_txnid != env->me_txns->mti_txnid)); -#if MDBX_MODE_ENABLED - } else if (bytes == sizeof(MDBX_envinfo)) { - MDB_meta *m1, *m2; - MDB_reader *r; - unsigned i; - - m1 = METAPAGE_1(env); - m2 = METAPAGE_2(env); - - do { - meta = mdb_meta_head_r(env); - arg->base.me_last_txnid = meta->mm_txnid; - arg->base.me_last_pgno = meta->mm_last_pg; - arg->me_meta1_txnid = m1->mm_txnid; - arg->me_meta1_sign = m1->mm_datasync_sign; - arg->me_meta2_txnid = m2->mm_txnid; - arg->me_meta2_sign = m2->mm_datasync_sign; - } while (unlikely( arg->base.me_last_txnid != env->me_txns->mti_txnid - || arg->me_meta1_sign != m1->mm_datasync_sign - || arg->me_meta2_sign != m2->mm_datasync_sign )); - - arg->base.me_mapaddr = meta->mm_address; - arg->base.me_mapsize = env->me_mapsize; - arg->base.me_maxreaders = env->me_maxreaders; - arg->base.me_numreaders = env->me_txns->mti_numreaders; - arg->me_tail_txnid = 0; - - r = env->me_txns->mti_readers; - arg->me_tail_txnid = arg->base.me_last_txnid; - for (i = 0; i < arg->base.me_numreaders; ++i ) { - if (r[i].mr_pid) { - txnid_t mr = r[i].mr_txnid; - if (arg->me_tail_txnid > mr) - arg->me_tail_txnid = mr; - } - } -#endif /* MDBX_MODE_ENABLED */ - } else { - return EINVAL; - } - - return MDB_SUCCESS; -} - -int __cold -mdb_env_info(MDB_env *env, MDB_envinfo *arg) -{ - return mdbx_env_info(env, (MDBX_envinfo*) arg, sizeof(MDB_envinfo)); -} - -static MDB_cmp_func* -mdbx_default_keycmp(unsigned flags) -{ - return (flags & MDB_REVERSEKEY) ? mdb_cmp_memnr : - (flags & MDB_INTEGERKEY) ? mdb_cmp_int_a2 : mdb_cmp_memn; -} - -static MDB_cmp_func* -mdbx_default_datacmp(unsigned flags) -{ - return !(flags & MDB_DUPSORT) ? 0 : - ((flags & MDB_INTEGERDUP) ? 
mdb_cmp_int_ua : - ((flags & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); -} - -/** Set the default comparison functions for a database. - * Called immediately after a database is opened to set the defaults. - * The user can then override them with #mdb_set_compare() or - * #mdb_set_dupsort(). - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - */ -static void -mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) -{ - unsigned flags = txn->mt_dbs[dbi].md_flags; - txn->mt_dbxs[dbi].md_cmp = mdbx_default_keycmp(flags); - txn->mt_dbxs[dbi].md_dcmp = mdbx_default_datacmp(flags); -} - -int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi) -{ - MDB_val key, data; - MDB_dbi i; - MDB_cursor mc; - MDB_db dummy; - int rc, dbflag, exact; - unsigned unused = 0, seq; - char *namedup; - size_t len; - - if (unlikely(!txn || !dbi)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(flags & ~VALID_FLAGS)) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - /* main DB? */ - if (!name) { - *dbi = MAIN_DBI; - if (flags & PERSISTENT_FLAGS) { - uint16_t f2 = flags & PERSISTENT_FLAGS; - /* make sure flag changes get committed */ - if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { - txn->mt_dbs[MAIN_DBI].md_flags |= f2; - txn->mt_flags |= MDB_TXN_DIRTY; - } - } - mdb_default_cmp(txn, MAIN_DBI); - return MDB_SUCCESS; - } - - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, MAIN_DBI); - } - - /* Is the DB already open? 
*/ - len = strlen(name); - for (i=CORE_DBS; imt_numdbs; i++) { - if (!txn->mt_dbxs[i].md_name.mv_size) { - /* Remember this free slot */ - if (!unused) unused = i; - continue; - } - if (len == txn->mt_dbxs[i].md_name.mv_size && - !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { - *dbi = i; - return MDB_SUCCESS; - } - } - - /* If no free slot and max hit, fail */ - if (!unused && unlikely(txn->mt_numdbs >= txn->mt_env->me_maxdbs)) - return MDB_DBS_FULL; - - /* Cannot mix named databases with some mainDB flags */ - if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))) - return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; - - /* Find the DB info */ - dbflag = DB_NEW|DB_VALID|DB_USRVALID; - exact = 0; - key.mv_size = len; - key.mv_data = (void *)name; - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); - if (likely(rc == MDB_SUCCESS)) { - /* make sure this is actually a DB */ - MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (unlikely((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)) - return MDB_INCOMPATIBLE; - } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { - return rc; - } - - /* Done here so we cannot fail after creating a new DB */ - if (unlikely((namedup = strdup(name)) == NULL)) - return ENOMEM; - - if (unlikely(rc)) { - /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ - data.mv_size = sizeof(MDB_db); - data.mv_data = &dummy; - memset(&dummy, 0, sizeof(dummy)); - dummy.md_root = P_INVALID; - dummy.md_flags = flags & PERSISTENT_FLAGS; - WITH_CURSOR_TRACKING(mc, - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); - dbflag |= DB_DIRTY; - } - - if (unlikely(rc)) { - free(namedup); - } else { - /* Got info, register DBI in this txn */ - unsigned slot = unused ? 
unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = namedup; - txn->mt_dbxs[slot].md_name.mv_size = len; - txn->mt_dbxs[slot].md_rel = NULL; - txn->mt_dbflags[slot] = dbflag; - /* txn-> and env-> are the same in read txns, use - * tmp variable to avoid undefined assignment - */ - seq = ++txn->mt_env->me_dbiseqs[slot]; - txn->mt_dbiseqs[slot] = seq; - - memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); - *dbi = slot; - mdb_default_cmp(txn, slot); - if (!unused) { - txn->mt_numdbs++; - } - } - - return rc; -} - -MDBX_ONLY_FEATURE int __cold -mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) -{ - if (unlikely(!arg || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; - - if (unlikely(bytes != sizeof(MDBX_stat))) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) { - MDB_cursor mc; - MDB_xcursor mx; - /* Stale, must read the DB's root. cursor_init does it for us. 
*/ - mdb_cursor_init(&mc, txn, dbi, &mx); - } - return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], &arg->base); -} - -int __cold -mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) -{ - return mdbx_stat(txn, dbi, (MDBX_stat*) arg, sizeof(MDB_stat)); -} - -void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) -{ - char *ptr; - if (dbi < CORE_DBS || dbi >= env->me_maxdbs) - return; - ptr = env->me_dbxs[dbi].md_name.mv_data; - /* If there was no name, this was already closed */ - if (ptr) { - env->me_dbxs[dbi].md_name.mv_data = NULL; - env->me_dbxs[dbi].md_name.mv_size = 0; - env->me_dbflags[dbi] = 0; - env->me_dbiseqs[dbi]++; - free(ptr); - } -} - -int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) -{ - if (unlikely(!txn || !flags)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; - - *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; - return MDB_SUCCESS; -} - -/** Add all the DB's pages to the free list. - * @param[in] mc Cursor on the DB to free. - * @param[in] subs non-Zero to check for sub-DBs in this DB. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_drop0(MDB_cursor *mc, int subs) -{ - int rc; - - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (likely(rc == MDB_SUCCESS)) { - MDB_txn *txn = mc->mc_txn; - MDB_node *ni; - MDB_cursor mx; - unsigned i; - - /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. - * This also avoids any P_LEAF2 pages, which have no nodes. - * Also if the DB doesn't have sub-DBs and has no overflow - * pages, omit scanning leaves. 
- */ - if ((mc->mc_flags & C_SUB) || - (!subs && !mc->mc_db->md_overflow_pages)) - mdb_cursor_pop(mc); - - mdb_cursor_copy(mc, &mx); - while (mc->mc_snum > 0) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - unsigned n = NUMKEYS(mp); - if (IS_LEAF(mp)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(mc, pg, &omp, NULL); - if (unlikely(rc)) - goto done; - mdb_cassert(mc, IS_OVERFLOW(omp)); - rc = mdb_midl_append_range(&txn->mt_free_pgs, - pg, omp->mp_pages); - if (unlikely(rc)) - goto done; - mc->mc_db->md_overflow_pages -= omp->mp_pages; - if (!mc->mc_db->md_overflow_pages && !subs) - break; - } else if (subs && (ni->mn_flags & F_SUBDATA)) { - mdb_xcursor_init1(mc, ni); - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (unlikely(rc)) - goto done; - } - } - if (!subs && !mc->mc_db->md_overflow_pages) - goto pop; - } else { - if (unlikely((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)) - goto done; - for (i=0; imt_free_pgs, pg); - } - } - if (!mc->mc_top) - break; - mc->mc_ki[mc->mc_top] = i; - rc = mdb_cursor_sibling(mc, 1); - if (rc) { - if (unlikely(rc != MDB_NOTFOUND)) - goto done; - /* no more siblings, go back to beginning - * of previous level. 
- */ -pop: - mdb_cursor_pop(mc); - mc->mc_ki[0] = 0; - for (i=1; imc_snum; i++) { - mc->mc_ki[i] = 0; - mc->mc_pg[i] = mx.mc_pg[i]; - } - } - } - /* free it */ - rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); -done: - if (unlikely(rc)) - txn->mt_flags |= MDB_TXN_ERROR; - } else if (rc == MDB_NOTFOUND) { - rc = MDB_SUCCESS; - } - mc->mc_flags &= ~C_INITIALIZED; - return rc; -} - -int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) -{ - MDB_cursor *mc, *m2; - int rc; - - if (unlikely(1 < (unsigned) del || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(TXN_DBI_CHANGED(txn, dbi))) - return MDB_BAD_DBI; - - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EACCES; - - rc = mdb_cursor_open(txn, dbi, &mc); - if (unlikely(rc)) - return rc; - - rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); - /* Invalidate the dropped DB's cursors */ - for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) - m2->mc_flags &= ~(C_INITIALIZED|C_EOF); - if (unlikely(rc)) - goto leave; - - /* Can't delete the main DB */ - if (del && dbi >= CORE_DBS) { - rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); - if (likely(!rc)) { - txn->mt_dbflags[dbi] = DB_STALE; - mdb_dbi_close(txn->mt_env, dbi); - } else { - txn->mt_flags |= MDB_TXN_ERROR; - } - } else { - /* reset the DB record, mark it dirty */ - txn->mt_dbflags[dbi] |= DB_DIRTY; - txn->mt_dbs[dbi].md_depth = 0; - txn->mt_dbs[dbi].md_branch_pages = 0; - txn->mt_dbs[dbi].md_leaf_pages = 0; - txn->mt_dbs[dbi].md_overflow_pages = 0; - txn->mt_dbs[dbi].md_entries = 0; - txn->mt_dbs[dbi].md_root = P_INVALID; - - txn->mt_flags |= MDB_TXN_DIRTY; - } -leave: - mdb_cursor_close(mc); - return rc; -} - -int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != 
MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_cmp = cmp; - return MDB_SUCCESS; -} - -int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_dcmp = cmp; - return MDB_SUCCESS; -} - -int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_rel = rel; - return MDB_SUCCESS; -} - -int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_relctx = ctx; - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_maxkeysize(MDB_env *env) -{ - if (!env || env->me_signature != MDBX_ME_SIGNATURE) - return EINVAL; - return ENV_MAXKEY(env); -} - -int __cold -mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) -{ - unsigned i, rdrs; - MDB_reader *mr; - char buf[64]; - int rc = 0, first = 1; - - if (unlikely(!env || !func)) - return -EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - rdrs = env->me_txns->mti_numreaders; - mr = env->me_txns->mti_readers; - for (i=0; i> 1; - cursor = base + pivot + 1; - val = pid - ids[cursor]; - - if( val < 0 ) { - n = pivot; - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - } else { - /* found, so it's a duplicate */ - return -1; - } - } - - if( val > 0 ) { - ++cursor; - } - ids[0]++; - 
for (n = ids[0]; n > cursor; n--) - ids[n] = ids[n-1]; - ids[n] = pid; - return 0; -} - -int __cold -mdb_reader_check(MDB_env *env, int *dead) -{ - if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) - return EINVAL; - if (dead) - *dead = 0; - return mdb_reader_check0(env, 0, dead); -} - -/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ -static int __cold -mdb_reader_check0(MDB_env *env, int rlocked, int *dead) -{ - pthread_mutex_t *rmutex = rlocked ? NULL : MDB_MUTEX(env, r); - unsigned i, j, rdrs; - MDB_reader *mr; - pid_t *pids, pid; - int rc = MDB_SUCCESS, count = 0; - - if (unlikely(env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - rdrs = env->me_txns->mti_numreaders; - pids = malloc((rdrs+1) * sizeof(pid_t)); - if (!pids) - return ENOMEM; - pids[0] = 0; - mr = env->me_txns->mti_readers; - for (i=0; ime_pid) { - if (mdb_pid_insert(pids, pid) == 0) { - if (!mdb_reader_pid(env, F_GETLK, pid)) { - /* Stale reader found */ - j = i; - if (rmutex) { - if ((rc = pthread_mutex_lock(rmutex)) != 0) { - if ((rc = mdb_mutex_failed(env, rmutex, rc))) - break; - rdrs = 0; /* the above checked all readers */ - } else { - /* Recheck, a new process may have reused pid */ - if (mdb_reader_pid(env, F_GETLK, pid)) - j = rdrs; - } - } - for (; j < rdrs; j++) { - if (mr[j].mr_pid == pid) { - mdb_debug("clear stale reader pid %u txn %zd", - (unsigned) pid, mr[j].mr_txnid); - mr[j].mr_pid = 0; - count++; - } - } - if (rmutex) - mdb_mutex_unlock(env, rmutex); - } - } - } - } - free(pids); - if (dead) - *dead = count; - return rc; -} - -static int __cold -mdb_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc) -{ -#if MDB_USE_ROBUST - if (unlikely(rc == EOWNERDEAD)) { - int rlocked, rc2; - - /* We own the mutex. Clean up after dead previous owner. 
*/ - rc = MDB_SUCCESS; - rlocked = (mutex == MDB_MUTEX(env, r)); - if (!rlocked) { - /* Keep mti_txnid updated, otherwise next writer can - * overwrite data which latest meta page refers to. - */ - #if 0 - /* LY: Hm, how this can happen, if the mti_txnid - * is updating only at the finish of a successful commit ? */ - - MDB_meta *meta = mdb_env_meta_head(env); - env->me_txns->mti_txnid = meta->mm_txnid; - #endif - /* env is hosed if the dead thread was ours */ - if (env->me_txn) { - env->me_flags |= MDB_FATAL_ERROR; - env->me_txn = NULL; - rc = MDB_PANIC; - } - } - mdb_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); - rc2 = mdb_reader_check0(env, rlocked, NULL); - if (rc2 == 0) - rc2 = pthread_mutex_consistent(mutex); - if (rc || (rc = rc2)) { - mdb_debug("mutex recovery failed, %s", mdb_strerror(rc)); - pthread_mutex_unlock(mutex); - } - } -#endif /* MDB_USE_ROBUST */ - if (unlikely(rc)) { - mdb_debug("lock mutex failed, %s", mdb_strerror(rc)); - if (rc != EDEADLK) { - env->me_flags |= MDB_FATAL_ERROR; - rc = MDB_PANIC; - } - } - - return rc; -} - -static int mdb_mutex_lock(MDB_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_lock(mutex); - if (unlikely(rc)) - rc = mdb_mutex_failed(env, mutex, rc); - return rc; -} - -static void mdb_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_unlock(mutex); - mdb_assert(env, rc == 0); - (void) env; - (void) rc; -} - -/** @} */ - -#include "./midl.c" diff --git a/mdb_chk.c b/mdb_chk.c deleted file mode 100644 index db141b4b..00000000 --- a/mdb_chk.c +++ /dev/null @@ -1,954 +0,0 @@ -/* mdbx_chk.c - memory-mapped database check tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * - * This file is part of libmdbx. 
- * - * libmdbx is free software; you can redistribute it and/or modify it under - * the terms of the GNU Affero General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * libmdbx is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "midl.h" -#include "mdbx.h" - -typedef struct flagbit { - int bit; - char *name; -} flagbit; - -flagbit dbflags[] = { - { MDB_DUPSORT, "dupsort" }, - { MDB_INTEGERKEY, "integerkey" }, - { MDB_REVERSEKEY, "reversekey" }, - { MDB_DUPFIXED, "dupfixed" }, - { MDB_REVERSEDUP, "reversedup" }, - { MDB_INTEGERDUP, "integerdup" }, - { 0, NULL } -}; - -static volatile sig_atomic_t gotsignal; - -static void signal_handler( int sig ) { - (void) sig; - gotsignal = 1; -} - -#define MAX_DBI 32768 - -#define EXIT_INTERRUPTED (EXIT_FAILURE+4) -#define EXIT_FAILURE_SYS (EXIT_FAILURE+3) -#define EXIT_FAILURE_MDB (EXIT_FAILURE+2) -#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE+1) -#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE - -struct { - const char* dbi_names[MAX_DBI]; - size_t dbi_pages[MAX_DBI]; - size_t dbi_empty_pages[MAX_DBI]; - size_t dbi_payload_bytes[MAX_DBI]; - size_t dbi_lost_bytes[MAX_DBI]; - short *pagemap; - size_t total_payload_bytes; - size_t pgcount; -} walk; - -static __attribute__((constructor)) -void init_walk(void) -{ - walk.dbi_names[0] = "@gc"; -} - -size_t total_unused_bytes; -int exclusive = 2; - -MDB_env *env; -MDB_txn *txn, *locktxn; -MDBX_envinfo info; -MDBX_stat stat; -size_t maxkeysize, reclaimable_pages, 
freedb_pages, lastpgno; -size_t userdb_count, skipped_subdb; -unsigned verbose, quiet; -const char* only_subdb; - -struct problem { - struct problem* pr_next; - size_t count; - const char* caption; -}; - -struct problem* problems_list; -size_t total_problems; - -static void __attribute__ ((format (printf, 1, 2))) -print(const char* msg, ...) { - if (! quiet) { - va_list args; - - fflush(stderr); - va_start(args, msg); - vfprintf(stdout, msg, args); - va_end(args); - } -} - -static void __attribute__ ((format (printf, 1, 2))) -error(const char* msg, ...) { - total_problems++; - - if (! quiet) { - va_list args; - - fflush(stdout); - va_start(args, msg); - vfprintf(stderr, msg, args); - va_end(args); - fflush(NULL); - } -} - -static void pagemap_cleanup(void) { - int i; - - for( i = 1; i < MAX_DBI; ++i ) { - if (walk.dbi_names[i]) { - free((void *) walk.dbi_names[i]); - walk.dbi_names[i] = NULL; - } - } - - free(walk.pagemap); - walk.pagemap = NULL; -} - -static int pagemap_lookup_dbi(const char* dbi) { - static int last; - int i; - - if (last > 0 && strcmp(walk.dbi_names[last], dbi) == 0) - return last; - - for(i = 1; walk.dbi_names[i] && last < MAX_DBI; ++i) - if (strcmp(walk.dbi_names[i], dbi) == 0) - return last = i; - - if (i == MAX_DBI) - return -1; - - walk.dbi_names[i] = strdup(dbi); - - if (verbose > 1) { - print(" - found '%s' area\n", dbi); - fflush(NULL); - } - - return last = i; -} - -static void problem_add(const char* object, size_t entry_number, const char* msg, const char *extra, ...) { - total_problems++; - - if (! quiet) { - int need_fflush = 0; - struct problem* p; - - for (p = problems_list; p; p = p->pr_next) - if (p->caption == msg) - break; - - if (! 
p) { - p = calloc(1, sizeof(*p)); - p->caption = msg; - p->pr_next = problems_list; - problems_list = p; - need_fflush = 1; - } - - p->count++; - if (verbose > 1) { - print(" %s #%zu: %s", object, entry_number, msg); - if (extra) { - va_list args; - printf(" ("); - va_start(args, extra); - vfprintf(stdout, extra, args); - va_end(args); - printf(")"); - } - printf("\n"); - if (need_fflush) - fflush(NULL); - } - } -} - -static struct problem* problems_push() { - struct problem* p = problems_list; - problems_list = NULL; - return p; -} - -static size_t problems_pop(struct problem* list) { - size_t count = 0; - - if (problems_list) { - int i; - - print(" - problems: "); - for (i = 0; problems_list; ++i) { - struct problem* p = problems_list->pr_next; - count += problems_list->count; - print("%s%s (%zu)", i ? ", " : "", problems_list->caption, problems_list->count); - free(problems_list); - problems_list = p; - } - print("\n"); - fflush(NULL); - } - - problems_list = list; - return count; -} - -static int pgvisitor(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi, - const char* type, int nentries, int payload_bytes, int header_bytes, int unused_bytes) -{ - (void) ctx; - - if (type) { - size_t page_bytes = payload_bytes + header_bytes + unused_bytes; - size_t page_size = pgnumber * stat.base.ms_psize; - int index = pagemap_lookup_dbi(dbi); - if (index < 0) - return ENOMEM; - - if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) { - if (pgnumber == 1) - print(" %s-page %zu", type, pgno); - else - print(" %s-span %zu[%u]", type, pgno, pgnumber); - print(" of %s: header %i, payload %i, unused %i\n", - dbi, header_bytes, payload_bytes, unused_bytes); - } - - walk.pgcount += pgnumber; - - if (unused_bytes < 0 || (size_t) unused_bytes > page_size) - problem_add("page", pgno, "illegal unused-bytes", "%zu < %i < %zu", - 0, unused_bytes, stat.base.ms_psize); - - if (header_bytes < (int) sizeof(long) || (size_t) header_bytes >= stat.base.ms_psize - 
sizeof(long)) - problem_add("page", pgno, "illegal header-length", "%zu < %i < %zu", - sizeof(long), header_bytes, stat.base.ms_psize - sizeof(long)); - if (payload_bytes < 1) { - if (nentries > 1) { - problem_add("page", pgno, "zero size-of-entry", "payload %i bytes, %i entries", - payload_bytes, nentries); - if ((size_t) header_bytes + unused_bytes < page_size) { - /* LY: hush a misuse error */ - page_bytes = page_size; - } - } else { - problem_add("page", pgno, "empty", "payload %i bytes, %i entries", - payload_bytes, nentries); - walk.dbi_empty_pages[index] += 1; - } - } - - if (page_bytes != page_size) { - problem_add("page", pgno, "misused", "%zu != %zu (%ih + %ip + %iu)", - page_size, page_bytes, header_bytes, payload_bytes, unused_bytes); - if (page_size > page_bytes) - walk.dbi_lost_bytes[index] += page_size - page_bytes; - } else { - walk.dbi_payload_bytes[index] += payload_bytes + header_bytes; - walk.total_payload_bytes += payload_bytes + header_bytes; - } - - if (pgnumber) { - do { - if (pgno >= lastpgno) - problem_add("page", pgno, "wrong page-no", - "%zu > %zi", pgno, lastpgno); - else if (walk.pagemap[pgno]) - problem_add("page", pgno, "already used", - "in %s", walk.dbi_names[walk.pagemap[pgno]]); - else { - walk.pagemap[pgno] = index; - walk.dbi_pages[index] += 1; - } - ++pgno; - } while(--pgnumber); - } - } - - return gotsignal ? 
EINTR : MDB_SUCCESS; -} - -typedef int (visitor)(size_t record_number, MDB_val *key, MDB_val* data); -static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent); - -static int handle_userdb(size_t record_number, MDB_val *key, MDB_val* data) { - (void) record_number; - (void) key; - (void) data; - return MDB_SUCCESS; -} - -static int handle_freedb(size_t record_number, MDB_val *key, MDB_val* data) { - char *bad = ""; - size_t pg, prev; - ssize_t i, number, span = 0; - size_t *iptr = data->mv_data, txnid = *(size_t*)key->mv_data; - - if (key->mv_size != sizeof(txnid)) - problem_add("entry", record_number, "wrong txn-id size", "key-size %zi", key->mv_size); - else if (txnid < 1 || txnid > info.base.me_last_txnid) - problem_add("entry", record_number, "wrong txn-id", "%zu", txnid); - - if (data->mv_size < sizeof(size_t) || data->mv_size % sizeof(size_t)) - problem_add("entry", record_number, "wrong idl size", "%zu", data->mv_size); - else { - number = *iptr++; - if (number >= MDB_IDL_UM_MAX) - problem_add("entry", record_number, "wrong idl length", "%zi", number); - else if ((number + 1) * sizeof(size_t) != data->mv_size) - problem_add("entry", record_number, "mismatch idl length", "%zi != %zu", - number * sizeof(size_t), data->mv_size); - else { - freedb_pages += number; - if (info.me_tail_txnid > txnid) - reclaimable_pages += number; - for (i = number, prev = 1; --i >= 0; ) { - pg = iptr[i]; - if (pg < 2 /* META_PAGE */ || pg > info.base.me_last_pgno) - problem_add("entry", record_number, "wrong idl entry", "2 < %zi < %zi", - pg, info.base.me_last_pgno); - else if (pg <= prev) { - bad = " [bad sequence]"; - problem_add("entry", record_number, "bad sequence", "%zi <= %zi", - pg, prev); - } - prev = pg; - pg += span; - for (; i >= span && iptr[i - span] == pg; span++, pg++) ; - } - if (verbose > 2 && !only_subdb) { - print(" transaction %zu, %zd pages, maxspan %zd%s\n", - *(size_t *)key->mv_data, number, span, bad); - if (verbose > 3) { - int j = 
number - 1; - while (j >= 0) { - pg = iptr[j]; - for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) ; - if (span > 1) - print(" %9zu[%zd]\n", pg, span); - else - print(" %9zu\n", pg); - } - } - } - } - } - - return MDB_SUCCESS; -} - -static int handle_maindb(size_t record_number, MDB_val *key, MDB_val* data) { - char *name; - int rc; - size_t i; - - name = key->mv_data; - for(i = 0; i < key->mv_size; ++i) { - if (name[i] < ' ') - return handle_userdb(record_number, key, data); - } - - name = malloc(key->mv_size + 1); - memcpy(name, key->mv_data, key->mv_size); - name[key->mv_size] = '\0'; - userdb_count++; - - rc = process_db(-1, name, handle_userdb, 0); - free(name); - if (rc != MDB_INCOMPATIBLE) - return rc; - - return handle_userdb(record_number, key, data); -} - -static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) -{ - MDB_cursor *mc; - MDBX_stat ms; - MDB_val key, data; - MDB_val prev_key, prev_data; - unsigned flags; - int rc, i; - struct problem* saved_list; - size_t problems_count; - - unsigned record_count = 0, dups = 0; - size_t key_bytes = 0, data_bytes = 0; - - if (0 > (int) dbi) { - rc = mdbx_dbi_open(txn, name, 0, &dbi); - if (rc) { - if (!name || rc != MDB_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { - error(" - mdbx_open '%s' failed, error %d %s\n", - name ? name : "main", rc, mdbx_strerror(rc)); - } - return rc; - } - } - - if (dbi >= 2 /* CORE_DBS */ && name && only_subdb && strcmp(only_subdb, name)) { - if (verbose) { - print("Skip processing '%s'...\n", name); - fflush(NULL); - } - skipped_subdb++; - return MDB_SUCCESS; - } - - if (! silent && verbose) { - print("Processing '%s'...\n", name ? 
name : "main"); - fflush(NULL); - } - - rc = mdbx_dbi_flags(txn, dbi, &flags); - if (rc) { - error(" - mdbx_dbi_flags failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); - if (rc) { - error(" - mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - if (! silent && verbose) { - print(" - dbi-id %d, flags:", dbi); - if (! flags) - print(" none"); - else { - for (i=0; dbflags[i].bit; i++) - if (flags & dbflags[i].bit) - print(" %s", dbflags[i].name); - } - print(" (0x%02X)\n", flags); - if (verbose > 1) { - print(" - page size %u, entries %zu\n", ms.base.ms_psize, ms.base.ms_entries); - print(" - b-tree depth %u, pages: branch %zu, leaf %zu, overflow %zu\n", - ms.base.ms_depth, ms.base.ms_branch_pages, ms.base.ms_leaf_pages, ms.base.ms_overflow_pages); - } - } - - rc = mdbx_cursor_open(txn, dbi, &mc); - if (rc) { - error(" - mdbx_cursor_open failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - saved_list = problems_push(); - prev_key.mv_data = NULL; - prev_data.mv_size = 0; - rc = mdbx_cursor_get(mc, &key, &data, MDB_FIRST); - while (rc == MDB_SUCCESS) { - if (gotsignal) { - print(" - interrupted by signal\n"); - fflush(NULL); - rc = EINTR; - goto bailout; - } - - if (key.mv_size > maxkeysize) { - problem_add("entry", record_count, "key length exceeds max-key-size", - "%zu > %zu", key.mv_size, maxkeysize); - } else if ((flags & MDB_INTEGERKEY) - && key.mv_size != sizeof(size_t) && key.mv_size != sizeof(int)) { - problem_add("entry", record_count, "wrong key length", - "%zu != %zu", key.mv_size, sizeof(size_t)); - } - - if ((flags & MDB_INTEGERDUP) - && data.mv_size != sizeof(size_t) && data.mv_size != sizeof(int)) { - problem_add("entry", record_count, "wrong data length", - "%zu != %zu", data.mv_size, sizeof(size_t)); - } - - if (prev_key.mv_data) { - if ((flags & MDB_DUPFIXED) && prev_data.mv_size != data.mv_size) { - problem_add("entry", record_count, "different data 
length", - "%zu != %zu", prev_data.mv_size, data.mv_size); - } - - int cmp = mdbx_cmp(txn, dbi, &prev_key, &key); - if (cmp > 0) { - problem_add("entry", record_count, "broken ordering of entries", NULL); - } else if (cmp == 0) { - ++dups; - if (! (flags & MDB_DUPSORT)) - problem_add("entry", record_count, "duplicated entries", NULL); - else if (flags & MDB_INTEGERDUP) { - cmp = mdbx_dcmp(txn, dbi, &prev_data, &data); - if (cmp > 0) - problem_add("entry", record_count, "broken ordering of multi-values", NULL); - } - } - } else if (verbose) { - if (flags & MDB_INTEGERKEY) - print(" - fixed key-size %zu\n", key.mv_size ); - if (flags & (MDB_INTEGERDUP | MDB_DUPFIXED)) - print(" - fixed data-size %zu\n", data.mv_size ); - } - - if (handler) { - rc = handler(record_count, &key, &data); - if (rc) - goto bailout; - } - - record_count++; - key_bytes += key.mv_size; - data_bytes += data.mv_size; - - prev_key = key; - prev_data = data; - rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT); - } - if (rc != MDB_NOTFOUND) - error(" - mdbx_cursor_get failed, error %d %s\n", rc, mdbx_strerror(rc)); - else - rc = 0; - - if (record_count != ms.base.ms_entries) - problem_add("entry", record_count, "differentent number of entries", - "%zu != %zu", record_count, ms.base.ms_entries); -bailout: - problems_count = problems_pop(saved_list); - if (! 
silent && verbose) { - print(" - summary: %u records, %u dups, %zu key's bytes, %zu data's bytes, %zu problems\n", - record_count, dups, key_bytes, data_bytes, problems_count); - fflush(NULL); - } - - mdbx_cursor_close(mc); - return rc || problems_count; -} - -static void usage(char *prog) -{ - fprintf(stderr, "usage: %s dbpath [-V] [-v] [-n] [-q] [-w] [-c] [-d] [-s subdb]\n" - " -V\t\tshow version\n" - " -v\t\tmore verbose, could be used multiple times\n" - " -n\t\tNOSUBDIR mode for open\n" - " -q\t\tbe quiet\n" - " -w\t\tlock DB for writing while checking\n" - " -d\t\tdisable page-by-page traversal of b-tree\n" - " -s subdb\tprocess a specific subdatabase only\n" - " -c\t\tforce cooperative mode (don't try exclusive)\n", prog); - exit(EXIT_INTERRUPTED); -} - -const char* meta_synctype(size_t sign) { - switch(sign) { - case 0: - return "no-sync/legacy"; - case 1: - return "weak"; - default: - return "steady"; - } -} - -int meta_lt(size_t txn1, size_t sign1, size_t txn2, size_t sign2) { - return ((sign1 > 1) == (sign2 > 1)) ? 
txn1 < txn2 : txn2 && sign2 > 1; -} - -int main(int argc, char *argv[]) -{ - int i, rc; - char *prog = argv[0]; - char *envname; - int envflags = MDB_RDONLY; - int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; - int dont_traversal = 0; - size_t n; - struct timespec timestamp_start, timestamp_finish; - double elapsed; - - atexit(pagemap_cleanup); - - if (clock_gettime(CLOCK_MONOTONIC, ×tamp_start)) { - rc = errno; - error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); - return EXIT_FAILURE_SYS; - } - - if (argc < 2) { - usage(prog); - } - - while ((i = getopt(argc, argv, "Vvqnwcds:")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", MDB_VERSION_STRING); - exit(EXIT_SUCCESS); - break; - case 'v': - verbose++; - break; - case 'q': - quiet = 1; - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 'w': - envflags &= ~MDB_RDONLY; - break; - case 'c': - exclusive = 0; - break; - case 'd': - dont_traversal = 1; - break; - case 's': - if (only_subdb && strcmp(only_subdb, optarg)) - usage(prog); - only_subdb = optarg; - break; - default: - usage(prog); - } - } - - if (optind != argc - 1) - usage(prog); - -#ifdef SIGPIPE - signal(SIGPIPE, signal_handler); -#endif -#ifdef SIGHUP - signal(SIGHUP, signal_handler); -#endif - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); - - envname = argv[optind]; - print("Running mdbx_chk for '%s' in %s mode...\n", - envname, (envflags & MDB_RDONLY) ? "read-only" : "write-lock"); - fflush(NULL); - - rc = mdbx_env_create(&env); - if (rc) { - error("mdbx_env_create failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc < 0 ? 
EXIT_FAILURE_MDB : EXIT_FAILURE_SYS; - } - - rc = mdbx_env_get_maxkeysize(env); - if (rc < 0) { - error("mdbx_env_get_maxkeysize failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - maxkeysize = rc; - - rc = mdbx_env_set_maxdbs(env, MAX_DBI); - if (rc < 0) { - error("mdbx_env_set_maxdbs failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - rc = mdbx_env_open_ex(env, envname, envflags, 0664, &exclusive); - if (rc) { - error("mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - if (verbose) - print(" - %s mode\n", exclusive ? "monopolistic" : "cooperative"); - - if (! (envflags & MDB_RDONLY)) { - rc = mdbx_txn_begin(env, NULL, 0, &locktxn); - if (rc) { - error("mdbx_txn_begin(lock-write) failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - } - - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) { - error("mdbx_txn_begin(read-only) failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - rc = mdbx_env_info(env, &info, sizeof(info)); - if (rc) { - error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - rc = mdbx_env_stat(env, &stat, sizeof(stat)); - if (rc) { - error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - lastpgno = info.base.me_last_pgno + 1; - errno = 0; - - if (verbose) { - double k = 1024.0; - const char sf[] = "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ - for(i = 0; sf[i+1] && info.base.me_mapsize / k > 1000.0; ++i) - k *= 1024; - print(" - map size %zu (%.2f %cb)\n", info.base.me_mapsize, - info.base.me_mapsize / k, sf[i]); - if (info.base.me_mapaddr) - print(" - mapaddr %p\n", info.base.me_mapaddr); - print(" - pagesize %u, max keysize %zu (%s), max readers %u\n", - stat.base.ms_psize, maxkeysize, - (maxkeysize == 511) ? "default" : - (maxkeysize == 0) ? 
"devel" : "custom", - info.base.me_maxreaders); - print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", info.base.me_last_txnid, - info.me_tail_txnid, info.base.me_last_txnid - info.me_tail_txnid); - - print(" - meta-1: %s %zu, %s", - meta_synctype(info.me_meta1_sign), info.me_meta1_txnid, - meta_lt(info.me_meta1_txnid, info.me_meta1_sign, - info.me_meta2_txnid, info.me_meta2_sign) ? "tail" : "head"); - if (info.me_meta1_txnid > info.base.me_last_txnid) - print(", rolled-back %zu (%zu >>> %zu)", - info.me_meta1_txnid - info.base.me_last_txnid, - info.me_meta1_txnid, info.base.me_last_txnid); - print("\n"); - - print(" - meta-2: %s %zu, %s", - meta_synctype(info.me_meta2_sign), info.me_meta2_txnid, - meta_lt(info.me_meta2_txnid, info.me_meta2_sign, - info.me_meta1_txnid, info.me_meta1_sign) ? "tail" : "head"); - if (info.me_meta2_txnid > info.base.me_last_txnid) - print(", rolled-back %zu (%zu >>> %zu)", - info.me_meta2_txnid - info.base.me_last_txnid, - info.me_meta2_txnid, info.base.me_last_txnid); - print("\n"); - } - - if (exclusive > 1) { - if (verbose) - print(" - perform full check last-txn-id with meta-pages\n"); - - if (! meta_lt(info.me_meta1_txnid, info.me_meta1_sign, - info.me_meta2_txnid, info.me_meta2_sign) - && info.me_meta1_txnid != info.base.me_last_txnid) { - print(" - meta-1 txn-id mismatch last-txn-id (%zi != %zi)\n", - info.me_meta1_txnid, info.base.me_last_txnid); - ++problems_meta; - } - - if (! meta_lt(info.me_meta2_txnid, info.me_meta2_sign, - info.me_meta1_txnid, info.me_meta1_sign) - && info.me_meta2_txnid != info.base.me_last_txnid) { - print(" - meta-2 txn-id mismatch last-txn-id (%zi != %zi)\n", - info.me_meta2_txnid, info.base.me_last_txnid); - ++problems_meta; - } - } else if (locktxn) { - if (verbose) - print(" - perform lite check last-txn-id with meta-pages (not a monopolistic mode)\n"); - size_t last = (info.me_meta2_txnid > info.me_meta1_txnid) ? 
info.me_meta2_txnid : info.me_meta1_txnid; - if (last != info.base.me_last_txnid) { - print(" - last-meta mismatch last-txn-id (%zi != %zi)\n", - last, info.base.me_last_txnid); - ++problems_meta; - } - } else if (verbose) { - print(" - skip check last-txn-id with meta-pages (monopolistic or write-lock mode only)\n"); - } - - if (!dont_traversal) { - struct problem* saved_list; - size_t traversal_problems; - size_t empty_pages, lost_bytes; - - print("Traversal b-tree...\n"); - fflush(NULL); - walk.pagemap = calloc(lastpgno, sizeof(*walk.pagemap)); - if (! walk.pagemap) { - rc = errno ? errno : ENOMEM; - error("calloc failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - saved_list = problems_push(); - rc = mdbx_env_pgwalk(txn, pgvisitor, NULL); - traversal_problems = problems_pop(saved_list); - - if (rc) { - if (rc == EINTR && gotsignal) { - print(" - interrupted by signal\n"); - fflush(NULL); - } else { - error("mdbx_env_pgwalk failed, error %d %s\n", rc, mdbx_strerror(rc)); - } - goto bailout; - } - - for( n = 0; n < lastpgno; ++n) - if (! 
walk.pagemap[n]) - walk.dbi_pages[0] += 1; - - empty_pages = lost_bytes = 0; - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - empty_pages += walk.dbi_empty_pages[i]; - lost_bytes += walk.dbi_lost_bytes[i]; - } - - if (verbose) { - size_t total_page_bytes = walk.pgcount * stat.base.ms_psize; - print(" - dbi pages: %zu total", walk.pgcount); - if (verbose > 1) - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) - print(", %s %zu", walk.dbi_names[i], walk.dbi_pages[i]); - print(", %s %zu\n", walk.dbi_names[0], walk.dbi_pages[0]); - if (verbose > 1) { - print(" - space info: total %zu bytes, payload %zu (%.1f%%), unused %zu (%.1f%%)\n", - total_page_bytes, walk.total_payload_bytes, - walk.total_payload_bytes * 100.0 / total_page_bytes, - total_page_bytes - walk.total_payload_bytes, - (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - size_t dbi_bytes = walk.dbi_pages[i] * stat.base.ms_psize; - print(" %s: subtotal %zu bytes (%.1f%%), payload %zu (%.1f%%), unused %zu (%.1f%%)", - walk.dbi_names[i], - dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, - walk.dbi_payload_bytes[i], walk.dbi_payload_bytes[i] * 100.0 / dbi_bytes, - dbi_bytes - walk.dbi_payload_bytes[i], - (dbi_bytes - walk.dbi_payload_bytes[i]) * 100.0 / dbi_bytes); - if (walk.dbi_empty_pages[i]) - print(", %zu empty pages", walk.dbi_empty_pages[i]); - if (walk.dbi_lost_bytes[i]) - print(", %zu bytes lost", walk.dbi_lost_bytes[i]); - print("\n"); - } - } - print(" - summary: average fill %.1f%%", walk.total_payload_bytes * 100.0 / total_page_bytes); - if (empty_pages) - print(", %zu empty pages", empty_pages); - if (lost_bytes) - print(", %zu bytes lost", lost_bytes); - print(", %zu problems\n", traversal_problems); - } - } else if (verbose) { - print("Skipping b-tree walk...\n"); - fflush(NULL); - } - - if (! 
verbose) - print("Iterating DBIs...\n"); - problems_maindb = process_db(-1, /* MAIN_DBI */ NULL, NULL, 0); - problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0); - - if (verbose) { - size_t value = info.base.me_mapsize / stat.base.ms_psize; - double percent = value / 100.0; - print(" - pages info: %zu total", value); - print(", allocated %zu (%.1f%%)", lastpgno, lastpgno / percent); - - if (verbose > 1) { - value = info.base.me_mapsize / stat.base.ms_psize - lastpgno; - print(", remained %zu (%.1f%%)", value, value / percent); - - value = lastpgno - freedb_pages; - print(", used %zu (%.1f%%)", value, value / percent); - - print(", gc %zu (%.1f%%)", freedb_pages, freedb_pages / percent); - - value = freedb_pages - reclaimable_pages; - print(", detained %zu (%.1f%%)", value, value / percent); - - print(", reclaimable %zu (%.1f%%)", reclaimable_pages, reclaimable_pages / percent); - } - - value = info.base.me_mapsize / stat.base.ms_psize - lastpgno + reclaimable_pages; - print(", available %zu (%.1f%%)\n", value, value / percent); - } - - if (problems_maindb == 0 && problems_freedb == 0) { - if (!dont_traversal && (exclusive || locktxn)) { - if (walk.pgcount != lastpgno - freedb_pages) { - error("used pages mismatch (%zu != %zu)\n", walk.pgcount, lastpgno - freedb_pages); - } - if (walk.dbi_pages[0] != freedb_pages) { - error("gc pages mismatch (%zu != %zu)\n", walk.dbi_pages[0], freedb_pages); - } - } else if (verbose) { - print(" - skip check used and gc pages (btree-traversal with monopolistic or write-lock mode only)\n"); - } - - if (! process_db(-1, NULL, handle_maindb, 1)) { - if (! userdb_count && verbose) - print(" - does not contain multiple databases\n"); - } - } - -bailout: - if (txn) - mdbx_txn_abort(txn); - if (locktxn) - mdbx_txn_abort(locktxn); - if (env) - mdbx_env_close(env); - fflush(NULL); - if (rc) { - if (rc < 0) - return gotsignal ? 
EXIT_INTERRUPTED : EXIT_FAILURE_SYS; - return EXIT_FAILURE_MDB; - } - - if (clock_gettime(CLOCK_MONOTONIC, ×tamp_finish)) { - rc = errno; - error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); - return EXIT_FAILURE_SYS; - } - - elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec - + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; - - total_problems += problems_meta; - if (total_problems || problems_maindb || problems_freedb) { - print("Total %zu error(s) is detected, elapsed %.3f seconds.\n", - total_problems, elapsed); - if (problems_meta || problems_maindb || problems_freedb) - return EXIT_FAILURE_CHECK_MAJOR; - return EXIT_FAILURE_CHECK_MINOR; - } - print("No error is detected, elapsed %.3f seconds\n", elapsed); - return EXIT_SUCCESS; -} diff --git a/mdb_copy.c b/mdb_copy.c deleted file mode 100644 index 43bee869..00000000 --- a/mdb_copy.c +++ /dev/null @@ -1,81 +0,0 @@ -/* mdb_copy.c - memory-mapped database backup tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2012-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#include -#include -#include -#include "mdbx.h" - -static void -sighandle(int sig) -{ - (void) sig; -} - -int main(int argc,char * argv[]) -{ - int rc; - MDB_env *env = NULL; - const char *progname = argv[0], *act; - unsigned flags = MDB_RDONLY; - unsigned cpflags = 0; - - for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { - if (argv[1][1] == 'n' && argv[1][2] == '\0') - flags |= MDB_NOSUBDIR; - else if (argv[1][1] == 'c' && argv[1][2] == '\0') - cpflags |= MDB_CP_COMPACT; - else if (argv[1][1] == 'V' && argv[1][2] == '\0') { - printf("%s\n", MDB_VERSION_STRING); - exit(0); - } else - argc = 0; - } - - if (argc<2 || argc>3) { - fprintf(stderr, "usage: %s [-V] [-c] [-n] srcpath [dstpath]\n", progname); - exit(EXIT_FAILURE); - } - -#ifdef SIGPIPE - signal(SIGPIPE, sighandle); -#endif -#ifdef SIGHUP - signal(SIGHUP, sighandle); -#endif - signal(SIGINT, sighandle); - signal(SIGTERM, sighandle); - - act = "opening environment"; - rc = mdb_env_create(&env); - if (rc == MDB_SUCCESS) { - rc = mdb_env_open(env, argv[1], flags, 0640); - } - if (rc == MDB_SUCCESS) { - act = "copying"; - if (argc == 2) - rc = mdb_env_copyfd2(env, STDOUT_FILENO, cpflags); - else - rc = mdb_env_copy2(env, argv[2], cpflags); - } - if (rc) - fprintf(stderr, "%s: %s failed, error %d (%s)\n", - progname, act, rc, mdb_strerror(rc)); - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdb_dump.c b/mdb_dump.c deleted file mode 100644 index 0b5db58e..00000000 --- a/mdb_dump.c +++ /dev/null @@ -1,314 +0,0 @@ -/* mdb_dump.c - memory-mapped database dump tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. 
- * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include -#include -#include -#include -#include -#include -#include -#include "mdbx.h" - -#define PRINT 1 -static int mode; - -typedef struct flagbit { - int bit; - char *name; -} flagbit; - -flagbit dbflags[] = { - { MDB_REVERSEKEY, "reversekey" }, - { MDB_DUPSORT, "dupsort" }, - { MDB_INTEGERKEY, "integerkey" }, - { MDB_DUPFIXED, "dupfixed" }, - { MDB_INTEGERDUP, "integerdup" }, - { MDB_REVERSEDUP, "reversedup" }, - { 0, NULL } -}; - -static volatile sig_atomic_t gotsig; - -static void dumpsig( int sig ) -{ - (void) sig; - gotsig = 1; -} - -static const char hexc[] = "0123456789abcdef"; - -static void hex(unsigned char c) -{ - putchar(hexc[c >> 4]); - putchar(hexc[c & 0xf]); -} - -static void text(MDB_val *v) -{ - unsigned char *c, *end; - - putchar(' '); - c = v->mv_data; - end = c + v->mv_size; - while (c < end) { - if (isprint(*c)) { - putchar(*c); - } else { - putchar('\\'); - hex(*c); - } - c++; - } - putchar('\n'); -} - -static void byte(MDB_val *v) -{ - unsigned char *c, *end; - - putchar(' '); - c = v->mv_data; - end = c + v->mv_size; - while (c < end) { - hex(*c++); - } - putchar('\n'); -} - -/* Dump in BDB-compatible format */ -static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) -{ - MDB_cursor *mc; - MDB_stat ms; - MDB_val key, data; - MDB_envinfo info; - unsigned int flags; - int rc, i; - - rc = mdb_dbi_flags(txn, dbi, &flags); - if (rc) return rc; - - rc = mdb_stat(txn, dbi, &ms); - if (rc) return rc; - - rc = mdb_env_info(mdb_txn_env(txn), &info); - if (rc) return rc; - - printf("VERSION=3\n"); - printf("format=%s\n", mode & PRINT ? 
"print" : "bytevalue"); - if (name) - printf("database=%s\n", name); - printf("type=btree\n"); - printf("mapsize=%zu\n", info.me_mapsize); - if (info.me_mapaddr) - printf("mapaddr=%p\n", info.me_mapaddr); - printf("maxreaders=%u\n", info.me_maxreaders); - - for (i=0; dbflags[i].bit; i++) - if (flags & dbflags[i].bit) - printf("%s=1\n", dbflags[i].name); - - printf("db_pagesize=%d\n", ms.ms_psize); - printf("HEADER=END\n"); - - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) return rc; - - while ((rc = mdb_cursor_get(mc, &key, &data, MDB_NEXT)) == MDB_SUCCESS) { - if (gotsig) { - rc = EINTR; - break; - } - if (mode & PRINT) { - text(&key); - text(&data); - } else { - byte(&key); - byte(&data); - } - } - printf("DATA=END\n"); - if (rc == MDB_NOTFOUND) - rc = MDB_SUCCESS; - - return rc; -} - -static void usage(char *prog) -{ - fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-a|-s subdb] dbpath\n", prog); - exit(EXIT_FAILURE); -} - -int main(int argc, char *argv[]) -{ - int i, rc; - MDB_env *env; - MDB_txn *txn; - MDB_dbi dbi; - char *prog = argv[0]; - char *envname; - char *subname = NULL; - int alldbs = 0, envflags = 0, list = 0; - - if (argc < 2) { - usage(prog); - } - - /* -a: dump main DB and all subDBs - * -s: dump only the named subDB - * -n: use NOSUBDIR flag on env_open - * -p: use printable characters - * -f: write to file instead of stdout - * -V: print version and exit - * (default) dump only the main DB - */ - while ((i = getopt(argc, argv, "af:lnps:V")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", MDB_VERSION_STRING); - exit(0); - break; - case 'l': - list = 1; - /*FALLTHROUGH*/; - case 'a': - if (subname) - usage(prog); - alldbs++; - break; - case 'f': - if (freopen(optarg, "w", stdout) == NULL) { - fprintf(stderr, "%s: %s: reopen: %s\n", - prog, optarg, strerror(errno)); - exit(EXIT_FAILURE); - } - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 'p': - mode |= PRINT; - break; - case 's': - if (alldbs) - usage(prog); - 
subname = optarg; - break; - default: - usage(prog); - } - } - - if (optind != argc - 1) - usage(prog); - -#ifdef SIGPIPE - signal(SIGPIPE, dumpsig); -#endif -#ifdef SIGHUP - signal(SIGHUP, dumpsig); -#endif - signal(SIGINT, dumpsig); - signal(SIGTERM, dumpsig); - - envname = argv[optind]; - rc = mdb_env_create(&env); - if (rc) { - fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); - return EXIT_FAILURE; - } - - if (alldbs || subname) { - mdb_env_set_maxdbs(env, 2); - } - - rc = mdb_env_open(env, envname, envflags | MDB_RDONLY, 0664); - if (rc) { - fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - rc = mdb_open(txn, subname, 0, &dbi); - if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - if (alldbs) { - MDB_cursor *cursor; - MDB_val key; - int count = 0; - - rc = mdb_cursor_open(txn, dbi, &cursor); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { - char *str; - MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) - continue; - count++; - str = malloc(key.mv_size+1); - memcpy(str, key.mv_data, key.mv_size); - str[key.mv_size] = '\0'; - rc = mdb_open(txn, str, 0, &db2); - if (rc == MDB_SUCCESS) { - if (list) { - printf("%s\n", str); - list++; - } else { - rc = dumpit(txn, db2, str); - if (rc) - break; - } - mdb_close(env, db2); - } - free(str); - if (rc) continue; - } - mdb_cursor_close(cursor); - if (!count) { - fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, envname); - rc = MDB_NOTFOUND; - } else if (rc == MDB_INCOMPATIBLE) { - /* LY: the record it not a named sub-db. 
*/ - rc = MDB_SUCCESS; - } - } else { - rc = dumpit(txn, dbi, subname); - } - if (rc && rc != MDB_NOTFOUND) - fprintf(stderr, "%s: %s: %s\n", prog, envname, mdb_strerror(rc)); - - mdb_close(env, dbi); -txn_abort: - mdb_txn_abort(txn); -env_close: - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdb_load.c b/mdb_load.c deleted file mode 100644 index e2cddd53..00000000 --- a/mdb_load.c +++ /dev/null @@ -1,456 +0,0 @@ -/* mdb_load.c - memory-mapped database load tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include -#include -#include -#include -#include -#include -#include "mdbx.h" - -#define PRINT 1 -#define NOHDR 2 -static int mode; - -static char *subname = NULL; - -static size_t lineno; -static int version; - -static int dbi_flags; - -static char *prog; - -static int Eof; - -static MDB_envinfo info; - -static MDB_val kbuf, dbuf; - -#define STRLENOF(s) (sizeof(s)-1) - -typedef struct flagbit { - int bit; - char *name; - int len; -} flagbit; - -#define S(s) s, STRLENOF(s) - -flagbit dbflags[] = { - { MDB_REVERSEKEY, S("reversekey") }, - { MDB_DUPSORT, S("dupsort") }, - { MDB_INTEGERKEY, S("integerkey") }, - { MDB_DUPFIXED, S("dupfixed") }, - { MDB_INTEGERDUP, S("integerdup") }, - { MDB_REVERSEDUP, S("reversedup") }, - { 0, NULL, 0 } -}; - -static void readhdr(void) -{ - char *ptr; - - dbi_flags = 0; - while (fgets(dbuf.mv_data, dbuf.mv_size, stdin) != NULL) { - lineno++; - if (!strncmp(dbuf.mv_data, "db_pagesize=", STRLENOF("db_pagesize=")) - || !strncmp(dbuf.mv_data, "duplicates=", 
STRLENOF("duplicates="))) { - /* LY: silently ignore information fields. */ - continue; - } else if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { - version=atoi((char *)dbuf.mv_data+STRLENOF("VERSION=")); - if (version > 3) { - fprintf(stderr, "%s: line %zd: unsupported VERSION %d\n", - prog, lineno, version); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "HEADER=END", STRLENOF("HEADER=END"))) { - break; - } else if (!strncmp(dbuf.mv_data, "format=", STRLENOF("format="))) { - if (!strncmp((char *)dbuf.mv_data+STRLENOF("FORMAT="), "print", STRLENOF("print"))) - mode |= PRINT; - else if (strncmp((char *)dbuf.mv_data+STRLENOF("FORMAT="), "bytevalue", STRLENOF("bytevalue"))) { - fprintf(stderr, "%s: line %zd: unsupported FORMAT %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("FORMAT=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "database=", STRLENOF("database="))) { - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - if (subname) free(subname); - subname = strdup((char *)dbuf.mv_data+STRLENOF("database=")); - } else if (!strncmp(dbuf.mv_data, "type=", STRLENOF("type="))) { - if (strncmp((char *)dbuf.mv_data+STRLENOF("type="), "btree", STRLENOF("btree"))) { - fprintf(stderr, "%s: line %zd: unsupported type %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("type=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "mapaddr=", STRLENOF("mapaddr="))) { - int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("mapaddr="), "%p", &info.me_mapaddr); - if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid mapaddr %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("mapaddr=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "mapsize=", STRLENOF("mapsize="))) { - int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("mapsize="), 
"%zu", &info.me_mapsize); - if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid mapsize %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("mapsize=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "maxreaders=", STRLENOF("maxreaders="))) { - int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("maxreaders="), "%u", &info.me_maxreaders); - if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid maxreaders %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("maxreaders=")); - exit(EXIT_FAILURE); - } - } else { - int i; - for (i=0; dbflags[i].bit; i++) { - if (!strncmp(dbuf.mv_data, dbflags[i].name, dbflags[i].len) && - ((char *)dbuf.mv_data)[dbflags[i].len] == '=') { - if (((char *)dbuf.mv_data)[dbflags[i].len+1] == '1') - dbi_flags |= dbflags[i].bit; - break; - } - } - if (!dbflags[i].bit) { - ptr = memchr(dbuf.mv_data, '=', dbuf.mv_size); - if (!ptr) { - fprintf(stderr, "%s: line %zd: unexpected format\n", - prog, lineno); - exit(EXIT_FAILURE); - } else { - *ptr = '\0'; - fprintf(stderr, "%s: line %zd: unrecognized keyword ignored: %s\n", - prog, lineno, (char *)dbuf.mv_data); - } - } - } - } -} - -static void badend(void) -{ - fprintf(stderr, "%s: line %zd: unexpected end of input\n", - prog, lineno); -} - -static int unhex(unsigned char *c2) -{ - int x, c; - x = *c2++ & 0x4f; - if (x & 0x40) - x -= 55; - c = x << 4; - x = *c2 & 0x4f; - if (x & 0x40) - x -= 55; - c |= x; - return c; -} - -static int readline(MDB_val *out, MDB_val *buf) -{ - unsigned char *c1, *c2, *end; - size_t len, l2; - int c; - - if (!(mode & NOHDR)) { - c = fgetc(stdin); - if (c == EOF) { - Eof = 1; - return EOF; - } - if (c != ' ') { - lineno++; - if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { -badend: - Eof = 1; - badend(); - return EOF; - } - if (c == 'D' && !strncmp(buf->mv_data, "ATA=END", STRLENOF("ATA=END"))) - return EOF; - goto badend; - } - } - if (fgets(buf->mv_data, buf->mv_size, 
stdin) == NULL) { - Eof = 1; - return EOF; - } - lineno++; - - c1 = buf->mv_data; - len = strlen((char *)c1); - l2 = len; - - /* Is buffer too short? */ - while (c1[len-1] != '\n') { - buf->mv_data = realloc(buf->mv_data, buf->mv_size*2); - if (!buf->mv_data) { - Eof = 1; - fprintf(stderr, "%s: line %zd: out of memory, line too long\n", - prog, lineno); - return EOF; - } - c1 = buf->mv_data; - c1 += l2; - if (fgets((char *)c1, buf->mv_size+1, stdin) == NULL) { - Eof = 1; - badend(); - return EOF; - } - buf->mv_size *= 2; - len = strlen((char *)c1); - l2 += len; - } - c1 = c2 = buf->mv_data; - len = l2; - c1[--len] = '\0'; - end = c1 + len; - - if (mode & PRINT) { - while (c2 < end) { - if (*c2 == '\\') { - if (c2[1] == '\\') { - c1++; c2 += 2; - } else { - if (c2+3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) { - Eof = 1; - badend(); - return EOF; - } - *c1++ = unhex(++c2); - c2 += 2; - } - } else { - /* copies are redundant when no escapes were used */ - *c1++ = *c2++; - } - } - } else { - /* odd length not allowed */ - if (len & 1) { - Eof = 1; - badend(); - return EOF; - } - while (c2 < end) { - if (!isxdigit(*c2) || !isxdigit(c2[1])) { - Eof = 1; - badend(); - return EOF; - } - *c1++ = unhex(c2); - c2 += 2; - } - } - c2 = out->mv_data = buf->mv_data; - out->mv_size = c1 - c2; - - return 0; -} - -static void usage(void) -{ - fprintf(stderr, "usage: %s [-V] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", prog); - exit(EXIT_FAILURE); -} - -int main(int argc, char *argv[]) -{ - int i, rc; - MDB_env *env; - MDB_txn *txn; - MDB_cursor *mc; - MDB_dbi dbi; - char *envname; - int envflags = 0, putflags = 0; - - prog = argv[0]; - - if (argc < 2) { - usage(); - } - - /* -f: load file instead of stdin - * -n: use NOSUBDIR flag on env_open - * -s: load into named subDB - * -N: use NOOVERWRITE on puts - * -T: read plaintext - * -V: print version and exit - */ - while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", 
MDB_VERSION_STRING); - exit(0); - break; - case 'f': - if (freopen(optarg, "r", stdin) == NULL) { - fprintf(stderr, "%s: %s: reopen: %s\n", - prog, optarg, strerror(errno)); - exit(EXIT_FAILURE); - } - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 's': - subname = strdup(optarg); - break; - case 'N': - putflags = MDB_NOOVERWRITE|MDB_NODUPDATA; - break; - case 'T': - mode |= NOHDR | PRINT; - break; - default: - usage(); - } - } - - if (optind != argc - 1) - usage(); - - dbuf.mv_size = 4096; - dbuf.mv_data = malloc(dbuf.mv_size); - - if (!(mode & NOHDR)) - readhdr(); - - envname = argv[optind]; - rc = mdb_env_create(&env); - if (rc) { - fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); - return EXIT_FAILURE; - } - - mdb_env_set_maxdbs(env, 2); - - if (info.me_maxreaders) - mdb_env_set_maxreaders(env, info.me_maxreaders); - - if (info.me_mapsize) - mdb_env_set_mapsize(env, info.me_mapsize); - - if (info.me_mapaddr) - envflags |= MDB_FIXEDMAP; - - rc = mdb_env_open(env, envname, envflags, 0664); - if (rc) { - fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - kbuf.mv_size = mdb_env_get_maxkeysize(env) * 2 + 2; - kbuf.mv_data = malloc(kbuf.mv_size); - - while(!Eof) { - MDB_val key, data; - int batch = 0; - - rc = mdb_txn_begin(env, NULL, 0, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - rc = mdb_open(txn, subname, dbi_flags|MDB_CREATE, &dbi); - if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - while(1) { - rc = readline(&key, &kbuf); - if (rc) /* rc == EOF */ - break; - - rc = readline(&data, &dbuf); - if (rc) { - fprintf(stderr, "%s: line %zd: failed to read key value\n", prog, 
lineno); - goto txn_abort; - } - - rc = mdb_cursor_put(mc, &key, &data, putflags); - if (rc == MDB_KEYEXIST && putflags) - continue; - if (rc) { - fprintf(stderr, "mdb_cursor_put failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - batch++; - if (batch == 100) { - rc = mdb_txn_commit(txn); - if (rc) { - fprintf(stderr, "%s: line %zd: txn_commit: %s\n", - prog, lineno, mdb_strerror(rc)); - goto env_close; - } - rc = mdb_txn_begin(env, NULL, 0, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - batch = 0; - } - } - rc = mdb_txn_commit(txn); - txn = NULL; - if (rc) { - fprintf(stderr, "%s: line %zd: txn_commit: %s\n", - prog, lineno, mdb_strerror(rc)); - goto env_close; - } - mdb_dbi_close(env, dbi); - if(!(mode & NOHDR)) - readhdr(); - } - -txn_abort: - mdb_txn_abort(txn); -env_close: - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdb_stat.c b/mdb_stat.c deleted file mode 100644 index d47ffe9e..00000000 --- a/mdb_stat.c +++ /dev/null @@ -1,299 +0,0 @@ -/* mdb_stat.c - memory-mapped database status tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#include -#include -#include -#include -#include "mdbx.h" - -static void prstat(MDBX_stat *ms) -{ -#if 0 - printf(" Page size: %u\n", ms->base.ms_psize); -#endif - printf(" Tree depth: %u\n", ms->base.ms_depth); - printf(" Branch pages: %zu\n", ms->base.ms_branch_pages); - printf(" Leaf pages: %zu\n", ms->base.ms_leaf_pages); - printf(" Overflow pages: %zu\n", ms->base.ms_overflow_pages); - printf(" Entries: %zu\n", ms->base.ms_entries); -} - -static void usage(char *prog) -{ - fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-a|-s subdb] dbpath\n", prog); - exit(EXIT_FAILURE); -} - -int main(int argc, char *argv[]) -{ - int i, rc; - MDB_env *env; - MDB_txn *txn; - MDB_dbi dbi; - MDBX_stat mst; - MDBX_envinfo mei; - char *prog = argv[0]; - char *envname; - char *subname = NULL; - int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0, rdrinfo = 0; - - if (argc < 2) { - usage(prog); - } - - /* -a: print stat of main DB and all subDBs - * -s: print stat of only the named subDB - * -e: print env info - * -f: print freelist info - * -r: print reader info - * -n: use NOSUBDIR flag on env_open - * -V: print version and exit - * (default) print stat of only the main DB - */ - while ((i = getopt(argc, argv, "Vaefnrs:")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", MDB_VERSION_STRING); - exit(0); - break; - case 'a': - if (subname) - usage(prog); - alldbs++; - break; - case 'e': - envinfo++; - break; - case 'f': - freinfo++; - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 'r': - rdrinfo++; - break; - case 's': - if (alldbs) - usage(prog); - subname = optarg; - break; - default: - usage(prog); - } - } - - if (optind != argc - 1) - usage(prog); - - envname = argv[optind]; - rc = mdb_env_create(&env); - if (rc) { - fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); - return EXIT_FAILURE; - } - - if (alldbs || subname) { - mdb_env_set_maxdbs(env, 4); - } - - rc = mdb_env_open(env, envname, envflags | 
MDB_RDONLY, 0664); - if (rc) { - fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - if (envinfo) { - (void)mdbx_env_stat(env, &mst, sizeof(mst)); - (void)mdbx_env_info(env, &mei, sizeof(mei)); - printf("Environment Info\n"); - printf(" Map address: %p\n", mei.base.me_mapaddr); - printf(" Map size: %zu\n", mei.base.me_mapsize); - printf(" Page size: %u\n", mst.base.ms_psize); - printf(" Max pages: %zu\n", mei.base.me_mapsize / mst.base.ms_psize); - printf(" Number of pages used: %zu\n", mei.base.me_last_pgno+1); - printf(" Last transaction ID: %zu\n", mei.base.me_last_txnid); - printf(" Tail transaction ID: %zu (%zi)\n", - mei.me_tail_txnid, mei.me_tail_txnid - mei.base.me_last_txnid); - printf(" Max readers: %u\n", mei.base.me_maxreaders); - printf(" Number of readers used: %u\n", mei.base.me_numreaders); - } else { - /* LY: zap warnings from gcc */ - memset(&mst, 0, sizeof(mst)); - memset(&mei, 0, sizeof(mei)); - } - - if (rdrinfo) { - printf("Reader Table Status\n"); - rc = mdb_reader_list(env, (MDB_msg_func *)fputs, stdout); - if (rdrinfo > 1) { - int dead; - mdb_reader_check(env, &dead); - printf(" %d stale readers cleared.\n", dead); - rc = mdb_reader_list(env, (MDB_msg_func *)fputs, stdout); - } - if (!(subname || alldbs || freinfo)) - goto env_close; - } - - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - if (freinfo) { - MDB_cursor *cursor; - MDB_val key, data; - size_t pages = 0, *iptr; - size_t reclaimable = 0; - - printf("Freelist Status\n"); - dbi = 0; - rc = mdb_cursor_open(txn, dbi, &cursor); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); - if (rc) { - fprintf(stderr, "mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - prstat(&mst); - while 
((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - iptr = data.mv_data; - pages += *iptr; - if (envinfo && mei.me_tail_txnid > *(size_t *)key.mv_data) - reclaimable += *iptr; - if (freinfo > 1) { - char *bad = ""; - size_t pg, prev; - ssize_t i, j, span = 0; - j = *iptr++; - for (i = j, prev = 1; --i >= 0; ) { - pg = iptr[i]; - if (pg <= prev) - bad = " [bad sequence]"; - prev = pg; - pg += span; - for (; i >= span && iptr[i-span] == pg; span++, pg++) ; - } - printf(" Transaction %zu, %zd pages, maxspan %zd%s\n", - *(size_t *)key.mv_data, j, span, bad); - if (freinfo > 2) { - for (--j; j >= 0; ) { - pg = iptr[j]; - for (span=1; --j >= 0 && iptr[j] == pg+span; span++) ; - if (span>1) - printf(" %9zu[%zd]\n", pg, span); - else - printf(" %9zu\n", pg); - } - } - } - } - mdb_cursor_close(cursor); - if (envinfo) { - size_t value = mei.base.me_mapsize / mst.base.ms_psize; - double percent = value / 100.0; - printf("Page Allocation Info\n"); - printf(" Max pages: %9zu 100%%\n", value); - - value = mei.base.me_last_pgno+1; - printf(" Number of pages used: %zu %.1f%%\n", value, value / percent); - - value = mei.base.me_mapsize / mst.base.ms_psize - (mei.base.me_last_pgno+1); - printf(" Remained: %zu %.1f%%\n", value, value / percent); - - value = mei.base.me_last_pgno+1 - pages; - printf(" Used now: %zu %.1f%%\n", value, value / percent); - - value = pages; - printf(" Unallocated: %zu %.1f%%\n", value, value / percent); - - value = pages - reclaimable; - printf(" Detained: %zu %.1f%%\n", value, value / percent); - - value = reclaimable; - printf(" Reclaimable: %zu %.1f%%\n", value, value / percent); - - value = mei.base.me_mapsize / mst.base.ms_psize - (mei.base.me_last_pgno+1) + reclaimable; - printf(" Available: %zu %.1f%%\n", value, value / percent); - } else - printf(" Free pages: %zu\n", pages); - } - - rc = mdb_open(txn, subname, 0, &dbi); - if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - rc = 
mdbx_stat(txn, dbi, &mst, sizeof(mst)); - if (rc) { - fprintf(stderr, "mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - printf("Status of %s\n", subname ? subname : "Main DB"); - prstat(&mst); - - if (alldbs) { - MDB_cursor *cursor; - MDB_val key; - - rc = mdb_cursor_open(txn, dbi, &cursor); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { - char *str; - MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) - continue; - str = malloc(key.mv_size+1); - memcpy(str, key.mv_data, key.mv_size); - str[key.mv_size] = '\0'; - rc = mdb_open(txn, str, 0, &db2); - if (rc == MDB_SUCCESS) - printf("Status of %s\n", str); - free(str); - if (rc) continue; - rc = mdbx_stat(txn, db2, &mst, sizeof(mst)); - if (rc) { - fprintf(stderr, "mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - prstat(&mst); - mdb_close(env, db2); - } - mdb_cursor_close(cursor); - } - - if (rc == MDB_NOTFOUND) - rc = MDB_SUCCESS; - - mdb_close(env, dbi); -txn_abort: - mdb_txn_abort(txn); -env_close: - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdbx.c b/mdbx.c index 66c377d8..b96857c1 100644 --- a/mdbx.c +++ b/mdbx.c @@ -1,7 +1,13 @@ /* * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. + * + * This code is derived from "LMDB engine" written by + * Howard Chu (Symas Corporation), which itself derived from btree.c + * written by Martin Hedenfalk. + * + * --- + * + * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -10,415 +16,11265 @@ * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at * . + * + * --- + * + * Portions Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include + #include "mdbx.h" -int mdb_runtime_flags = MDBX_DBG_PRINT +#ifndef MDB_DEBUG +#define MDB_DEBUG 0 +#endif + +/* LY: Please do not ask us for Windows support, just never! + * But you can make a fork for Windows, or become maintainer for FreeBSD... */ +#ifndef __gnu_linux__ +#warning "libmdbx supports only GNU Linux" +#endif + +#if !defined(__GNUC__) || !__GNUC_PREREQ(4, 2) +/* LY: Actually libmdbx was not tested with compilers + * older than GCC 4.4 (from RHEL6). + * But you could ignore this #warning and try to continue at your own risk. + * In such a case please don't raise issues related ONLY to old compilers. + */ +#warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." 
+#endif + +#if !defined(__GNU_LIBRARY__) || !__GLIBC_PREREQ(2, 12) +/* LY: Actually libmdbx was not tested with something + * older than glibc 2.12 (from RHEL6). + * But you could ignore this #warning and try to continue at your own risk. + * In such a case please don't raise issues related ONLY to old systems. + */ +#warning "libmdbx required at least GLIBC 2.12." +#endif + #if MDB_DEBUG - | MDBX_DBG_ASSERT +#undef NDEBUG +#endif + +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_FILE_H +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) +#include +#include /* defines BYTE_ORDER on HPUX and Solaris */ +#endif + +#ifndef _POSIX_SYNCHRONIZED_IO +#define fdatasync fsync +#endif + +#ifndef BYTE_ORDER +#if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && \ + !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) +/* Solaris just defines one or the other */ +#define LITTLE_ENDIAN 1234 +#define BIG_ENDIAN 4321 +#ifdef _LITTLE_ENDIAN +#define BYTE_ORDER LITTLE_ENDIAN +#else +#define BYTE_ORDER BIG_ENDIAN +#endif +#else +#define BYTE_ORDER __BYTE_ORDER +#endif +#endif + +#ifndef LITTLE_ENDIAN +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#endif +#ifndef BIG_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define MISALIGNED_OK 1 +#endif + +#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) +#error "Unknown or unsupported endianness (BYTE_ORDER)" +#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +#error "Two's complement, reasonably sized integer types, please" +#endif + +#include "./barriers.h" +#include "./midl.h" +#include "./reopen.h" + +/** Search for an ID in an IDL. + * @param[in] ids The IDL to search. + * @param[in] id The ID to search for. + * @return The index of the first ID greater than or equal to \b id. 
+ */ +static unsigned mdbx_midl_search(MDB_IDL ids, MDB_ID id); + +/** Allocate an IDL. + * Allocates memory for an IDL of the given size. + * @return IDL on success, NULL on failure. + */ +static MDB_IDL mdbx_midl_alloc(int num); + +/** Free an IDL. + * @param[in] ids The IDL to free. + */ +static void mdbx_midl_free(MDB_IDL ids); + +/** Shrink an IDL. + * Return the IDL to the default size if it has grown larger. + * @param[in,out] idp Address of the IDL to shrink. + */ +static void mdbx_midl_shrink(MDB_IDL *idp); + +/** Make room for num additional elements in an IDL. + * @param[in,out] idp Address of the IDL. + * @param[in] num Number of elements to make room for. + * @return 0 on success, ENOMEM on failure. + */ +static int mdbx_midl_need(MDB_IDL *idp, unsigned num); + +/** Append an ID onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] id The ID to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id); + +/** Append an IDL onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] app The IDL to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app); + +/** Append an ID range onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] id The lowest ID to append. + * @param[in] n Number of IDs to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n); + +/** Merge an IDL onto an IDL. The destination IDL must be big enough. + * @param[in] idl The IDL to merge into. + * @param[in] merge The IDL to merge. + */ +static void mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge); + +/** Sort an IDL. + * @param[in,out] ids The IDL to sort. + */ +static void mdbx_midl_sort(MDB_IDL ids); + +/** Search for an ID in an ID2L. + * @param[in] ids The ID2L to search. 
+ * @param[in] id The ID to search for. + * @return The index of the first ID2 whose \b mid member is greater than + * or equal to \b id. + */ +static unsigned mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id); + +/** Insert an ID2 into a ID2L. + * @param[in,out] ids The ID2L to insert into. + * @param[in] id The ID2 to insert. + * @return 0 on success, -1 if the ID was already present in the ID2L. + */ +static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id); + +/** Append an ID2 into a ID2L. + * @param[in,out] ids The ID2L to append into. + * @param[in] id The ID2 to append. + * @return 0 on success, -2 if the ID2L is too big. + */ +static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id); + +int mdbx_runtime_flags = MDBX_DBG_PRINT +#if MDB_DEBUG + | MDBX_DBG_ASSERT #endif #if MDB_DEBUG > 1 - | MDBX_DBG_TRACE + | MDBX_DBG_TRACE #endif #if MDB_DEBUG > 2 - | MDBX_DBG_AUDIT + | MDBX_DBG_AUDIT #endif #if MDB_DEBUG > 3 - | MDBX_DBG_EXTRA + | MDBX_DBG_EXTRA #endif - ; + ; -static MDBX_debug_func *mdb_debug_logger; +static MDBX_debug_func *mdbx_debug_logger; -int mdbx_setup_debug(int flags, MDBX_debug_func* logger, long edge_txn); +int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); -#include "mdb.c" +/** Features under development */ +#ifndef MDB_DEVEL +#define MDB_DEVEL 0 +#endif -int __cold -mdbx_setup_debug(int flags, MDBX_debug_func* logger, long edge_txn) { - unsigned ret = mdb_runtime_flags; - if (flags != (int) MDBX_DBG_DNT) - mdb_runtime_flags = flags; - if (logger != (MDBX_debug_func*) MDBX_DBG_DNT) - mdb_debug_logger = logger; - if (edge_txn != (long) MDBX_DBG_DNT) { +/** Wrapper around __func__, which is a C99 feature */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define mdbx_func_ __func__ +#elif (defined(__GNUC__) && __GNUC__ >= 2) || defined(__clang__) +#define mdbx_func_ __FUNCTION__ +#else +/* If a debug message says (), update the #if statements above */ +#define mdbx_func_ "" +#endif + +/** Some platforms define the 
EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDB_USE_ROBUST=0. + */ +#ifndef MDB_USE_ROBUST +/* Howard Chu: Android currently lacks Robust Mutex support */ +#if defined(EOWNERDEAD) && \ + !defined(ANDROID) /* LY: glibc before 2.10 has a troubles with Robust \ + Mutex too. */ \ + && __GLIBC_PREREQ(2, 10) +#define MDB_USE_ROBUST 1 +#else +#define MDB_USE_ROBUST 0 +#endif +#endif /* MDB_USE_ROBUST */ + +/* Internal error codes, not exposed outside liblmdb */ +#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) + +/** Mutex for the reader table (rw = r) or write transaction (rw = w). + */ +#define MDB_MUTEX(env, rw) (&(env)->me_txns->mti_##rw##mutex) + +/** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#define HANDLE int + +/** A value for an invalid file handle. + * Mainly used to initialize file variables and signify that they are + * unused. + */ +#define INVALID_HANDLE_VALUE (-1) + +/** Get the size of a memory page for the system. + * This is the basic size that the platform's memory manager uses, and is + * fundamental to the use of memory-mapped files. + */ +#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) + +/** @} */ + +static int mdbx_mutex_lock(MDB_env *env, pthread_mutex_t *mutex); +static int mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); +static void mdbx_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex); + +/** A page number in the database. + * Note that 64 bit page numbers are overkill, since pages themselves + * already represent 12-13 bits of addressable memory, and the OS will + * always limit applications to a maximum of 63 bits of address space. + * + * @note In the #MDB_node structure, we only store 48 bits of this value, + * which thus limits us to only 60 bits of addressable data. + */ +typedef MDB_ID pgno_t; + +/** A transaction ID. + * See struct MDB_txn.mt_txnid for details. 
+ */ +typedef MDB_ID txnid_t; + +/** @defgroup debug Debug Macros + * @{ + */ +/** Debugging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) +/** @} */ + +/** @brief The maximum size of a database page. + * + * It is 32k or 64k, since value-PAGEBASE must fit in + * #MDB_page.%mp_upper. + * + * LMDB will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. + */ +#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) + +/** The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. + */ +#define MDB_MINKEYS 2 + +/** A stamp that identifies a file as an LMDB file. + * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches. 
+ */ +#define MDB_MAGIC 0xBEEFC0DE + +/** The version number for a database's datafile format. */ +#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) +/** The version number for a database's lockfile format. */ +#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) + +/** @brief The max size of a key we can write, or 0 for computed max. + * + * This macro should normally be left alone or set to 0. + * Note that a database with big keys or dupsort data cannot be + * reliably modified by a liblmdb which uses a smaller max. + * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. + * + * Other values are allowed, for backwards compat. However: + * A value bigger than the computed max can break if you do not + * know what you are doing, and liblmdb <= 0.9.10 can break when + * modifying a DB with keys/dupsort data bigger than its max. + * + * Data items in an #MDB_DUPSORT database are also limited to + * this size, since they're actually keys of a sub-DB. Keys and + * #MDB_DUPSORT data items must fit on a node in a regular page. + */ +#ifndef MDB_MAXKEYSIZE +#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) +#endif + +/** The maximum size of a key we can write to the environment. */ +#if MDB_MAXKEYSIZE +#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) +#else +#define ENV_MAXKEY(env) ((env)->me_maxkey_limit) +#endif /* MDB_MAXKEYSIZE */ + +/** @brief The maximum size of a data item. + * + * We only store a 32 bit value for node sizes. + */ +#define MAXDATASIZE 0xffffffffUL + +/** Key size which fits in a #DKBUF. + * @ingroup debug + */ +#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) +/** A key buffer. + * @ingroup debug + * This is used for printing a hex dump of a key's contents. + */ +#define DKBUF char kbuf[DKBUF_MAXKEYSIZE * 2 + 1] +/** Display a key in hex. + * @ingroup debug + * Invoke a function to display a key in hex. + */ +#define DKEY(x) mdbx_dkey(x, kbuf) + +/** An invalid page number. + * Mainly used to denote an empty tree. 
+ */ +#define P_INVALID (~(pgno_t)0) + +/** Test if the flags \b f are set in a flag word \b w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + +/** Round \b n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + +/** Used for offsets within a single page. + * Since memory pages are typically 4 or 8KB in size, 12-13 bits, + * this is plenty. + */ +typedef uint16_t indx_t; + +/** Default size of memory map. + * This is certainly too small for any actual applications. Apps should + * always set the size explicitly using #mdbx_env_set_mapsize(). + */ +#define DEFAULT_MAPSIZE 1048576 + +/** @defgroup readers Reader Lock Table + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent + * read transactions started by the same thread need no further locking + * to proceed. + * + * If #MDB_NOTLS is set, the slot address is not saved in thread-specific + * data. + * + * No reader table is used if the database is on a read-only filesystem, + * or if #MDB_NOLOCK is set. + * + * Since the database uses multi-version concurrency control, readers + * don't actually need any locking. This table is used to keep track of + * which readers are using data from which old transactions, so that we'll + * know when a particular old transaction is no longer in use. Old + * transactions that have discarded any data pages can then have those + * pages reclaimed for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with + * the processor's cache line size. Any slot is only ever used by one thread. 
+ * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the + * oldest outstanding reader transaction. Any freed pages older than this + * will be reclaimed by the writer. The writer doesn't use any locks when + * scanning this table. This means that there's no guarantee that the + * writer will see the most up-to-date reader info, but that's not required + * for correct operation - all we need is to know the upper bound on the + * oldest reader, we don't care at all about the newest reader. So the only + * consequence of reading stale information here is that old pages might + * hang around a while longer before being reclaimed. That's actually good + * anyway, because the longer we delay reclaiming old pages, the more likely + * it is that a string of contiguous pages can be found after coalescing old + * pages from many old transactions together. + * @{ + */ +/** Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. 126 readers plus a + * couple mutexes fit exactly into 8KB on my development machine. + * Applications should set the table size using + * #mdbx_env_set_maxreaders(). + */ +#define DEFAULT_READERS 126 + +/** The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * @note We currently don't check for stale records. We simply re-init + * the table when we know that we're the only process opening the + * lock file. + */ +typedef struct MDB_rxbody { + /** Current Transaction ID when this transaction began, or (txnid_t)-1. 
+ * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. + */ + volatile txnid_t mrb_txnid; + /** The process ID of the process owning this reader txn. */ + volatile pid_t mrb_pid; + /** The thread ID of the thread owning this txn. */ + volatile pthread_t mrb_tid; +} MDB_rxbody; + +/** The actual reader record, with cacheline padding. */ +typedef struct MDB_reader { + union { + MDB_rxbody mrx; +/** shorthand for mrb_txnid */ +#define mr_txnid mru.mrx.mrb_txnid +#define mr_pid mru.mrx.mrb_pid +#define mr_tid mru.mrx.mrb_tid + /** cache line alignment */ + char pad[(sizeof(MDB_rxbody) + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1)]; + } mru; +} MDB_reader; + +/** The header for the reader table. + * The table resides in a memory-mapped file. (This is a different file + * than is used for the main database.) + * + * For POSIX the actual mutexes reside in the shared memory of this + * mapped file. On Windows, mutexes are named objects allocated by the + * kernel; we store the mutex names in this mapped file so that other + * processes can grab them. This same approach is also used on + * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support + * process-shared POSIX mutexes. For these cases where a named object + * is used, the object name is derived from a 64 bit FNV hash of the + * environment pathname. As such, naming collisions are extremely + * unlikely. If a collision occurs, the results are unpredictable. + */ +typedef struct MDB_txbody { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mtb_magic; + /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ + uint32_t mtb_format; + /** Mutex protecting access to this table. 
+ * This is the #MDB_MUTEX(env,r) reader table lock. + */ + pthread_mutex_t mtb_rmutex; + /** The ID of the last transaction committed to the database. + * This is recorded here only for convenience; the value can always + * be determined by reading the main database meta pages. + */ + volatile txnid_t mtb_txnid; + /** The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. + */ + volatile unsigned mtb_numreaders; +} MDB_txbody; + +/** The actual reader table definition. */ +typedef struct MDB_txninfo { + union { + MDB_txbody mtb; +#define mti_magic mt1.mtb.mtb_magic +#define mti_format mt1.mtb.mtb_format +#define mti_rmutex mt1.mtb.mtb_rmutex +#define mti_rmname mt1.mtb.mtb_rmname +#define mti_txnid mt1.mtb.mtb_txnid +#define mti_numreaders mt1.mtb.mtb_numreaders + char pad[(sizeof(MDB_txbody) + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1)]; + } mt1; + union { + pthread_mutex_t mt2_wmutex; +#define mti_wmutex mt2.mt2_wmutex + char pad[(sizeof(pthread_mutex_t) + CACHELINE_SIZE - 1) & + ~(CACHELINE_SIZE - 1)]; + } mt2; + MDB_reader mti_readers[1]; +} MDB_txninfo; + +/** Lockfile format signature: version, features and field layout */ +#define MDB_LOCK_FORMAT \ + ((uint32_t)((MDB_LOCK_VERSION) /* Flags which describe functionality */ \ + + (0 /* SYSV_SEM_FLAG */ << 18) + (1 /* MDB_PIDLOCK */ << 16))) +/** @} */ + +/** Common header for all page types. The page type depends on #mp_flags. + * + * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with + * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages + * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. + * + * #P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of #F_BIGDATA nodes. + * + * #P_SUBP sub-pages are small leaf "pages" with duplicate data. 
+ * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. + */ +typedef struct MDB_page { +#define mp_pgno mp_p.p_pgno +#define mp_next mp_p.p_next + union { + pgno_t p_pgno; /**< page number */ + struct MDB_page *p_next; /**< for in-memory list of freed pages */ + } mp_p; + uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ + /** @defgroup mdbx_page Page Flags + * @ingroup internal + * Flags for the page headers. + * @{ + */ +#define P_BRANCH 0x01 /**< branch page */ +#define P_LEAF 0x02 /**< leaf page */ +#define P_OVERFLOW 0x04 /**< overflow page */ +#define P_META 0x08 /**< meta page */ +#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ +#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ +#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /**< leave this page alone during spill */ + /** @} */ + uint16_t mp_flags; /**< @ref mdbx_page */ +#define mp_lower mp_pb.pb.pb_lower +#define mp_upper mp_pb.pb.pb_upper +#define mp_pages mp_pb.pb_pages + union { + struct { + indx_t pb_lower; /**< lower bound of free space */ + indx_t pb_upper; /**< upper bound of free space */ + } pb; + uint32_t pb_pages; /**< number of overflow pages */ + } mp_pb; + indx_t mp_ptrs[1]; /**< dynamic size */ +} MDB_page; + +/** Size of the page header, excluding dynamic data at the end */ +#define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs)) + +/** Address of first usable data byte in a page, after the header */ +#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + +/** ITS#7713, change PAGEBASE to handle 65536 byte pages */ 
+#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) + +/** Number of nodes on a page */ +#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) + +/** The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + +/** The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) \ + (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) +/** The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. + */ +#define FILL_THRESHOLD 250 + +/** Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) +/** Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) +/** Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) +/** Test if a page is an overflow page */ +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) +/** Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + +/** The number of overflow pages needed to store the given size. */ +#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) + +/** Link in #MDB_txn.%mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. + */ +#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) + +/** Header for a single key/data pair within a page. + * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. + * We guarantee 2-byte alignment for 'MDB_node's. + * + * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. 
#F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just #F_SUBDATA). + */ +typedef struct MDB_node { +/** part of data size or pgno + * @{ */ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short mn_lo, mn_hi; +#else + unsigned short mn_hi, mn_lo; +#endif +/** @} */ +/** @defgroup mdbx_node Node Flags + * @ingroup internal + * Flags for node headers. + * @{ + */ +#define F_BIGDATA 0x01 /**< data put on overflow page */ +#define F_SUBDATA 0x02 /**< data is a sub-database */ +#define F_DUPDATA 0x04 /**< data has duplicates */ + +/** valid flags for #mdbx_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDB_RESERVE | MDB_APPEND) + + /** @} */ + unsigned short mn_flags; /**< @ref mdbx_node */ + unsigned short mn_ksize; /**< key size */ + char mn_data[1]; /**< key and data are appended here */ +} MDB_node; + +/** Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDB_node, mn_data) + +/** Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + +/** Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. + */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) + +/** Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. 
+ */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) + +/** Address of node \b i in page \b p */ +#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) + +/** Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + +/** Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + +/** Get the page number pointed to by a branch node */ +#define NODEPGNO(node) \ + ((node)->mn_lo | ((pgno_t)(node)->mn_hi << 16) | \ + (PGNO_TOPWORD ? ((pgno_t)(node)->mn_flags << PGNO_TOPWORD) : 0)) +/** Set the page number in a branch node */ +#define SETPGNO(node, pgno) \ + do { \ + (node)->mn_lo = (pgno)&0xffff; \ + (node)->mn_hi = (pgno) >> 16; \ + if (PGNO_TOPWORD) \ + (node)->mn_flags = (pgno) >> PGNO_TOPWORD; \ + } while (0) + +/** Get the size of the data in a leaf node */ +#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) +/** Set the size of the data for a leaf node */ +#define SETDSZ(node, size) \ + do { \ + (node)->mn_lo = (size)&0xffff; \ + (node)->mn_hi = (size) >> 16; \ + } while (0) +/** The size of a key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + +/** Copy a page number from src to dst */ +#ifdef MISALIGNED_OK +#define COPY_PGNO(dst, src) dst = src +#elif SIZE_MAX > 4294967295UL +#define COPY_PGNO(dst, src) \ + do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d++ = *s++; \ + *d++ = *s++; \ + *d = *s; \ + } while (0) +#else +#define COPY_PGNO(dst, src) \ + do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d = *s; \ + } while (0) +#endif /* MISALIGNED_OK */ + +/** The address of a key in a LEAF2 page. + * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate + * sub-DBs. + * There are no node headers, keys are stored contiguously.
+ */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) + +/** Set the \b node's key into \b keyptr, if requested. */ +#define MDB_GET_KEY(node, keyptr) \ + { \ + if ((keyptr) != NULL) { \ + (keyptr)->mv_size = NODEKSZ(node); \ + (keyptr)->mv_data = NODEKEY(node); \ + } \ + } + +/** Set the \b node's key into \b key. */ +#define MDB_GET_KEY2(node, key) \ + { \ + key.mv_size = NODEKSZ(node); \ + key.mv_data = NODEKEY(node); \ + } + +/** Information about a single database in the environment. */ +typedef struct MDB_db { + uint32_t md_xsize; /**< also ksize for LEAF2 pages */ + uint16_t md_flags; /**< @ref mdbx_dbi_open */ + uint16_t md_depth; /**< depth of this tree */ + pgno_t md_branch_pages; /**< number of internal pages */ + pgno_t md_leaf_pages; /**< number of leaf pages */ + pgno_t md_overflow_pages; /**< number of overflow pages */ + size_t md_entries; /**< number of data items */ + pgno_t md_root; /**< the root page of this tree */ +} MDB_db; + +#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) +/** #mdbx_dbi_open() flags */ +#define VALID_FLAGS \ + (MDB_REVERSEKEY | MDB_DUPSORT | MDB_INTEGERKEY | MDB_DUPFIXED | \ + MDB_INTEGERDUP | MDB_REVERSEDUP | MDB_CREATE) + +/** Handle for the DB used to track free pages. */ +#define FREE_DBI 0 +/** Handle for the default DB. */ +#define MAIN_DBI 1 +/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 + +/** Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 2 + +/** Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). + */ +typedef struct MDB_meta { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mm_magic; + /** Version number of this file. Must be set to #MDB_DATA_VERSION. 
*/ + uint32_t mm_version; + void *mm_address; /**< address for fixed mapping */ + size_t mm_mapsize; /**< size of mmap region */ + MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ + /** The size of pages used in this DB */ +#define mm_psize mm_dbs[FREE_DBI].md_xsize +/** Any persistent environment flags. @ref mdbx_env */ +#define mm_flags mm_dbs[FREE_DBI].md_flags + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. + */ + pgno_t mm_last_pg; + volatile txnid_t mm_txnid; /**< txnid that committed this page */ +#define MDB_DATASIGN_NONE 0 +#define MDB_DATASIGN_WEAK 1 + volatile uint64_t mm_datasync_sign; +#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) +#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) + +#if MDBX_MODE_ENABLED + volatile mdbx_canary mm_canary; +#endif +} MDB_meta; + +/** Buffer for a stack-allocated meta page. + * The members define size and alignment, and silence type + * aliasing warnings. They are not used directly; that could + * mean incorrectly using several union members in parallel. + */ +typedef union MDB_metabuf { + MDB_page mb_page; + struct { + char mm_pad[PAGEHDRSZ]; + MDB_meta mm_meta; + } mb_metabuf; +} MDB_metabuf; + +/** Auxiliary DB info. + * The information here is mostly static/read-only. There is + * only a single copy of this record in the environment. + */ +typedef struct MDB_dbx { + MDB_val md_name; /**< name of the database */ + MDB_cmp_func *md_cmp; /**< function for comparing keys */ + MDB_cmp_func *md_dcmp; /**< function for comparing data items */ + MDB_rel_func *md_rel; /**< user relocate function */ + void *md_relctx; /**< user-provided context for md_rel */ +} MDB_dbx; + +#if MDBX_MODE_ENABLED +#define MDBX_MODE_SALT 0 +#else +#error !? +#endif + +/** A database transaction. + * Every operation requires a transaction handle. 
+ */ +struct MDB_txn { +#define MDBX_MT_SIGNATURE (0x93D53A31 ^ MDBX_MODE_SALT) + unsigned mt_signature; + MDB_txn *mt_parent; /**< parent of a nested txn */ + /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ + MDB_txn *mt_child; + pgno_t mt_next_pgno; /**< next unallocated page */ + /** The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. + */ + txnid_t mt_txnid; + MDB_env *mt_env; /**< the DB environment */ + /** The list of reclaimed txns from freeDB */ + MDB_IDL mt_lifo_reclaimed; + /** The list of pages that became unused during this transaction. + */ + MDB_IDL mt_free_pgs; + /** The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). + */ + MDB_page *mt_loose_pgs; + /** Number of loose pages (#mt_loose_pgs) */ + int mt_loose_count; + /** The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. + */ + MDB_IDL mt_spill_pgs; + union { + /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + MDB_ID2L dirty_list; + /** For read txns: This thread/txn's reader table slot, or NULL. */ + MDB_reader *reader; + } mt_u; + /** Array of records for each DB known in the environment. 
*/ + MDB_dbx *mt_dbxs; + /** Array of MDB_db records for each known DB */ + MDB_db *mt_dbs; + /** Array of sequence numbers for each DB handle */ + unsigned *mt_dbiseqs; +/** @defgroup mt_dbflag Transaction DB Flags + * @ingroup internal + * @{ + */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ +#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ +#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ + /** @} */ + /** In write txns, array of cursors for each DB */ + MDB_cursor **mt_cursors; + /** Array of flags for each DB */ + unsigned char *mt_dbflags; + /** Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. + */ + MDB_dbi mt_numdbs; + +/** @defgroup mdbx_txn Transaction Flags + * @ingroup internal + * @{ + */ +/** #mdbx_txn_begin() flags */ +#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY) +#define MDB_TXN_NOMETASYNC \ + MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ +#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ +#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ + /* internal txn flags */ +#define MDB_TXN_WRITEMAP \ + MDB_WRITEMAP /**< copy of #MDB_env flag in writers \ + */ +#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ +#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ +#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ +#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ +/** most operations on the txn are currently illegal */ +#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED 
| MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) + /** @} */ + unsigned mt_flags; /**< @ref mdbx_txn */ + /** #dirty_list room: Array size - \#dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. + */ + unsigned mt_dirty_room; + +#if MDBX_MODE_ENABLED + mdbx_canary mt_canary; +#endif +}; + +/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. + * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to + * raise this on a 64 bit machine. + */ +#define CURSOR_STACK 32 + +struct MDB_xcursor; + +/** Cursors are used for all DB operations. + * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. #MDB_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a #P_SUBP page can be stale. + * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). 
+ */ +struct MDB_cursor { +#define MDBX_MC_SIGNATURE (0xFE05D5B1 ^ MDBX_MODE_SALT) +#define MDBX_MC_READY4CLOSE (0x2817A047 ^ MDBX_MODE_SALT) +#define MDBX_MC_WAIT4EOT (0x90E297A7 ^ MDBX_MODE_SALT) + unsigned mc_signature; + /** Next cursor on this DB in this txn */ + MDB_cursor *mc_next; + /** Backup of the original cursor if this cursor is a shadow */ + MDB_cursor *mc_backup; + /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ + struct MDB_xcursor *mc_xcursor; + /** The transaction that owns this cursor */ + MDB_txn *mc_txn; + /** The database handle this cursor operates on */ + MDB_dbi mc_dbi; + /** The database record for this cursor */ + MDB_db *mc_db; + /** The database auxiliary record for this cursor */ + MDB_dbx *mc_dbx; + /** The @ref mt_dbflag for this database */ + unsigned char *mc_dbflag; + unsigned short mc_snum; /**< number of pushed pages */ + unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ + /** @defgroup mdbx_cursor Cursor Flags + * @ingroup internal + * Cursor state flags. + * @{ + */ +#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ +#define C_EOF 0x02 /**< No more data */ +#define C_SUB 0x04 /**< Cursor is a sub-cursor */ +#define C_DEL 0x08 /**< last op was a cursor_del */ +#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ + /** @} */ + unsigned mc_flags; /**< @ref mdbx_cursor */ + MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +}; + +/** Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. 
+ */ +typedef struct MDB_xcursor { + /** A sub-cursor for traversing the Dup DB */ + MDB_cursor mx_cursor; + /** The database record for this Dup DB */ + MDB_db mx_db; + /** The auxiliary DB record for this Dup DB */ + MDB_dbx mx_dbx; + /** The @ref mt_dbflag for this Dup DB */ + unsigned char mx_dbflag; +} MDB_xcursor; + +/** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + +/** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed + * when the node which contains the sub-page may have moved. Called + * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. + */ +#define XCURSOR_REFRESH(mc, mp, ki) \ + do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node = NODEPTR(xr_pg, ki); \ + if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ + } while (0) + +/** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ +} MDB_pgstate; + +/** Context for deferred cleanup of reader's threads. + * to avoid https://github.com/ReOpen/ReOpenLDAP/issues/48 */ +typedef struct MDBX_rthc { + struct MDBX_rthc *rc_next; + pthread_t rc_thread; + MDB_reader *rc_reader; +} MDBX_rthc; + +static MDBX_rthc *mdbx_rthc_get(pthread_key_t key); + +/** The database environment. */ +struct MDB_env { +#define MDBX_ME_SIGNATURE (0x9A899641 ^ MDBX_MODE_SALT) + unsigned me_signature; + HANDLE me_fd; /**< The main data file */ + HANDLE me_lfd; /**< The lock file */ + /** Failed to update the meta page. Probably an I/O error. */ +#define MDB_FATAL_ERROR 0x80000000U +/** Some fields are initialized. 
*/ +#define MDB_ENV_ACTIVE 0x20000000U +/** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U + uint32_t me_flags; /**< @ref mdbx_env */ + unsigned me_psize; /**< DB page size, inited from me_os_psize */ + unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */ + unsigned me_maxreaders; /**< size of the reader table */ + /** Max #MDB_txninfo.%mti_numreaders of interest to #mdbx_env_close() */ + unsigned me_close_readers; + MDB_dbi me_numdbs; /**< number of DBs opened */ + MDB_dbi me_maxdbs; /**< size of the DB table */ + pid_t me_pid; /**< process ID of this env */ + char *me_path; /**< path to the DB files */ + char *me_map; /**< the memory map of the data file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file, never NULL */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ + MDB_txn *me_txn; /**< current write transaction */ + MDB_txn *me_txn0; /**< prealloc'd write transaction */ + size_t me_mapsize; /**< size of the data memory map */ + pgno_t me_maxpg; /**< me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /**< array of static DB info */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ + unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ + pthread_key_t me_txkey; /**< thread-key for readers */ + txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +#define me_pglast me_pgstate.mf_pglast +#define me_pghead me_pgstate.mf_pghead + MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ + /** IDL of pages that became unused in a write txn */ + MDB_IDL me_free_pgs; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ + MDB_ID2L me_dirty_list; + /** Max number of freelist items that can fit in a single overflow page */ + unsigned me_maxfree_1pg; + /** Max size of a node on a page */ + unsigned me_nodemax; + unsigned me_maxkey_limit; /**< max size of a key */ + int me_live_reader; /**< have liveness lock in reader table */ + void *me_userctx; /**< User-settable context */ #if MDB_DEBUG - mdb_debug_edge = edge_txn; + MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ #endif + uint64_t me_sync_pending; /**< Total dirty/committed bytes since the last + mdbx_env_sync() */ + uint64_t + me_sync_threshold; /**< Threshold of above to force synchronous flush */ +#if MDBX_MODE_ENABLED + MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ +#endif +#ifdef USE_VALGRIND + int me_valgrind_handle; +#endif +}; + +/** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /**< the transaction */ + MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ +} MDB_ntxn; + +/** max number of pages to commit in one writev() call */ +#define MDB_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES +#undef MDB_COMMIT_PAGES +#define MDB_COMMIT_PAGES IOV_MAX +#endif + +/** max bytes to write in one call */ +#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) + +/** Check \b txn and \b dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + +/** Check for misused \b dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +#define METAPAGE_1(env) (&((MDB_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) + +#define METAPAGE_2(env) \ + (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) + +static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); +static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, + MDB_page **mp); +static
int mdbx_page_touch(MDB_cursor *mc); +static int mdbx_cursor_touch(MDB_cursor *mc); + +#define MDB_END_NAMES \ + { \ + "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ + "fail-beginchild" \ + } +enum { + /* mdbx_txn_end operation number, for logging */ + MDB_END_COMMITTED, + MDB_END_EMPTY_COMMIT, + MDB_END_ABORT, + MDB_END_RESET, + MDB_END_RESET_TMP, + MDB_END_FAIL_BEGIN, + MDB_END_FAIL_BEGINCHILD +}; +#define MDB_END_OPMASK 0x0F /**< mask for #mdbx_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ +#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ +static int mdbx_txn_end(MDB_txn *txn, unsigned mode); + +static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); +#define MDB_PS_MODIFY 1 +#define MDB_PS_ROOTONLY 2 +#define MDB_PS_FIRST 4 +#define MDB_PS_LAST 8 +static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags); +static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); + +#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ +static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, + pgno_t newpgno, unsigned nflags); + +static int mdbx_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); +static void mdbx_env_close0(MDB_env *env); + +static MDB_node *mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, + MDB_val *data, pgno_t pgno, unsigned flags); +static void mdbx_node_del(MDB_cursor *mc, int ksize); +static void mdbx_node_shrink(MDB_page *mp, indx_t indx); +static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); +static int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, 
MDB_val *data); +static size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); +static size_t mdbx_branch_size(MDB_env *env, MDB_val *key); + +static int mdbx_rebalance(MDB_cursor *mc); +static int mdbx_update_key(MDB_cursor *mc, MDB_val *key); + +static void mdbx_cursor_pop(MDB_cursor *mc); +static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp); + +static int mdbx_cursor_del0(MDB_cursor *mc); +static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags); +static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right); +static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op); +static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op); +static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp); +static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); +static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); + +static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, + MDB_xcursor *mx); +static void mdbx_xcursor_init0(MDB_cursor *mc); +static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node); +static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); + +static int mdbx_drop0(MDB_cursor *mc, int subs); +static int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); + +/** @cond */ +static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, + mdbx_cmp_int_a2, mdbx_cmp_int_ua; +/** @endcond */ + +#ifdef __SANITIZE_THREAD__ +static pthread_mutex_t tsan_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +/** Return the library version info. 
*/ +char *__cold mdbx_version(int *major, int *minor, int *patch) { + if (major) + *major = MDB_VERSION_MAJOR; + if (minor) + *minor = MDB_VERSION_MINOR; + if (patch) + *patch = MDB_VERSION_PATCH; + return MDB_VERSION_STRING; +} + +/** Table of descriptions for LMDB @ref errors */ +static char *const mdbx_errstr[] = { + "MDB_KEYEXIST: Key/data pair already exists", + "MDB_NOTFOUND: No matching key/data pair found", + "MDB_PAGE_NOTFOUND: Requested page not found", + "MDB_CORRUPTED: Located page was wrong type", + "MDB_PANIC: Update of meta page failed or environment had fatal error", + "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_INVALID: File is not an LMDB file", + "MDB_MAP_FULL: Environment mapsize limit reached", + "MDB_DBS_FULL: Environment maxdbs limit reached", + "MDB_READERS_FULL: Environment maxreaders limit reached", + "MDB_TLS_FULL: Thread-local storage keys full - too many environments " + "open", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too " + "big", + "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong " + "DUPFIXED size", + "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", + "MDB_PROBLEM: Unexpected problem - txn should abort", +}; + +char *__cold mdbx_strerror(int err) { + int i; + if (!err) + return ("Successful return: 0"); + + if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { + i = err - MDB_KEYEXIST; + return mdbx_errstr[i]; + } + + return strerror(err); +} + +#if MDBX_MODE_ENABLED +static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); +#endif /* 
MDBX_MODE_ENABLED */ + +static void mdbx_debug_log(int type, const char *function, int line, + const char *fmt, ...) + __attribute__((format(printf, 4, 5))); + +#if MDB_DEBUG +static txnid_t mdbx_debug_edge; + +static void __cold mdbx_assert_fail(MDB_env *env, const char *msg, + const char *func, int line) { + if (env && env->me_assert_func) + env->me_assert_func(env, msg, func, line); + else { + if (mdbx_debug_logger) + mdbx_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); + __assert_fail(msg, __FILE__, line, func); + } +} + +#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT) + +#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT) + +#define mdbx_debug_enabled(type) \ + unlikely(mdbx_runtime_flags &(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) + +#else +#ifndef NDEBUG +#define mdbx_debug_enabled(type) (1) +#else +#define mdbx_debug_enabled(type) (0) +#endif +#define mdbx_audit_enabled() (0) +#define mdbx_assert_enabled() (0) +#define mdbx_assert_fail(env, msg, func, line) \ + __assert_fail(msg, __FILE__, line, func) +#endif /* MDB_DEBUG */ + +static void __cold mdbx_debug_log(int type, const char *function, int line, + const char *fmt, ...) { + va_list args; + + va_start(args, fmt); + if (mdbx_debug_logger) + mdbx_debug_logger(type, function, line, fmt, args); + else { + if (function && line > 0) + fprintf(stderr, "%s:%d ", function, line); + else if (function) + fprintf(stderr, "%s: ", function); + else if (line > 0) + fprintf(stderr, "%d: ", line); + vfprintf(stderr, fmt, args); + } + va_end(args); +} + +#define mdbx_print(fmt, ...) \ + mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) + +#define mdbx_debug(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_print(fmt, ...) 
\ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, NULL, 0, fmt, ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_extra(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ + mdbx_debug_log(MDBX_DBG_EXTRA, __FUNCTION__, __LINE__, fmt, \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_extra_print(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ + mdbx_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ + } while (0) + +#define mdbx_ensure_msg(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + mdbx_assert_fail(env, msg, __FUNCTION__, __LINE__); \ + } while (0) + +#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) + +/** assert(3) variant in environment context */ +#define mdbx_assert(env, expr) \ + do { \ + if (mdbx_assert_enabled()) \ + mdbx_ensure(env, expr); \ + } while (0) + +/** assert(3) variant in cursor context */ +#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) + +/** assert(3) variant in transaction context */ +#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) + +/** Return the page number of \b mp which may be sub-page, for debug output */ +static MDBX_INLINE pgno_t mdbx_dbg_pgno(MDB_page *mp) { + pgno_t ret; + COPY_PGNO(ret, mp->mp_pgno); + return ret; +} + +/** Display a key in hexadecimal and return the address of the result. + * @param[in] key the key to display + * @param[out] buf the buffer to write into. Should always be #DKBUF. + * @return The key in hexadecimal form. + */ +char *mdbx_dkey(MDB_val *key, char *buf) { + char *ptr = buf; + unsigned i; + + if (!key) + return ""; + + if (key->mv_size > DKBUF_MAXKEYSIZE) + return "MDB_MAXKEYSIZE"; +/* may want to make this a dynamic check: if the key is mostly + * printable characters, print it as-is instead of converting to hex. 
*/ +#if 1 + buf[0] = '\0'; + for (i = 0; i < key->mv_size; i++) + ptr += sprintf(ptr, "%02x", ((unsigned char *)key->mv_data)[i]); +#else + sprintf(buf, "%.*s", key->mv_size, key->mv_data); +#endif + return buf; +} + +#if 0 /* LY: debug stuff */ +static const char * +mdbx_leafnode_type(MDB_node *n) +{ + static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; + return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : + tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; +} + +/** Display all the keys in the page. */ +static void +mdbx_page_list(MDB_page *mp) +{ + pgno_t pgno = mdbx_dbg_pgno(mp); + const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; + MDB_node *node; + unsigned i, nkeys, nsize, total = 0; + MDB_val key; + DKBUF; + + switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { + case P_BRANCH: type = "Branch page"; break; + case P_LEAF: type = "Leaf page"; break; + case P_LEAF|P_SUBP: type = "Sub-page"; break; + case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; + case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; + case P_OVERFLOW: + mdbx_print("Overflow page %zu pages %u%s\n", + pgno, mp->mp_pages, state); + return; + case P_META: + mdbx_print("Meta-page %zu txnid %zu\n", + pgno, ((MDB_meta *)PAGEDATA(mp))->mm_txnid); + return; + default: + mdbx_print("Bad page %zu flags 0x%X\n", pgno, mp->mp_flags); + return; } - return ret; -} - -static txnid_t __cold -mdbx_oomkick(MDB_env *env, txnid_t oldest) -{ - int retry; - txnid_t snap; - mdb_debug("DB size maxed out"); - - for(retry = 0; ; ++retry) { - int reader; - - if (mdb_reader_check(env, NULL)) - break; - - snap = mdb_find_oldest(env, &reader); - if (oldest < snap || reader < 0) { - if (retry && env->me_oom_func) { - /* LY: notify end of oom-loop */ - env->me_oom_func(env, 0, 0, oldest, snap - oldest, -retry); - } - return snap; - } - - MDB_reader *r; - pthread_t tid; - pid_t pid; - int rc; - - if (!env->me_oom_func) - 
break; - - r = &env->me_txns->mti_readers[ reader ]; - pid = r->mr_pid; - tid = r->mr_tid; - if (r->mr_txnid != oldest || pid <= 0) - continue; - - rc = env->me_oom_func(env, pid, (void*) tid, oldest, - mdb_meta_head_w(env)->mm_txnid - oldest, retry); - if (rc < 0) - break; - - if (rc) { - r->mr_txnid = ~(txnid_t)0; - if (rc > 1) { - r->mr_tid = 0; - r->mr_pid = 0; - mdbx_coherent_barrier(); - } - } - } - - if (retry && env->me_oom_func) { - /* LY: notify end of oom-loop */ - env->me_oom_func(env, 0, 0, oldest, 0, -retry); - } - return mdb_find_oldest(env, NULL); -} - -int __cold -mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) -{ - if (unlikely(!env)) - return EINVAL; - - if(unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - env->me_sync_threshold = bytes; - return env->me_map ? mdb_env_sync(env, 0) : MDB_SUCCESS; -} - -void __cold -mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) -{ - if (likely(env && env->me_signature == MDBX_ME_SIGNATURE)) - env->me_oom_func = oomfunc; -} - -MDBX_oom_func* __cold -mdbx_env_get_oomfunc(MDB_env *env) -{ - return likely(env && env->me_signature == MDBX_ME_SIGNATURE) - ? env->me_oom_func : NULL; -} - -ATTRIBUTE_NO_SANITIZE_THREAD /* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ -int mdbx_txn_straggler(MDB_txn *txn, int *percent) -{ - MDB_env *env; - MDB_meta *meta; - txnid_t lag; - - if(unlikely(!txn)) - return -EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(! txn->mt_u.reader)) - return -1; - - env = txn->mt_env; - meta = mdb_meta_head_r(env); - if (percent) { - size_t maxpg = env->me_maxpg; - size_t last = meta->mm_last_pg + 1; - if (env->me_txn) - last = env->me_txn0->mt_next_pgno; - *percent = (last * 100ull + maxpg / 2) / maxpg; - } - lag = meta->mm_txnid - txn->mt_u.reader->mr_txnid; - return (0 > (long) lag) ? 
~0u >> 1: lag; -} - -typedef struct mdb_walk_ctx { - MDB_txn *mw_txn; - void *mw_user; - MDBX_pgvisitor_func *mw_visitor; -} mdb_walk_ctx_t; - -/** Depth-first tree traversal. */ -static int __cold -mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int deep) -{ - MDB_page *mp; - int rc, i, nkeys; - unsigned header_size, unused_size, payload_size, align_bytes; - const char* type; - - if (pg == P_INVALID) - return MDB_SUCCESS; /* empty db */ - - MDB_cursor mc; - memset(&mc, 0, sizeof(mc)); - mc.mc_snum = 1; - mc.mc_txn = ctx->mw_txn; - - rc = mdb_page_get(&mc, pg, &mp, NULL); - if (rc) - return rc; - if (pg != mp->mp_p.p_pgno) - return MDB_CORRUPTED; nkeys = NUMKEYS(mp); - header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower; - unused_size = SIZELEFT(mp); - payload_size = 0; + mdbx_print("%s %zu numkeys %u%s\n", type, pgno, nkeys, state); - /* LY: Don't use mask here, e.g bitwise (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. 
*/ - switch (mp->mp_flags) { - case P_BRANCH: - type = "branch"; - if (nkeys < 1) - return MDB_CORRUPTED; - break; - case P_LEAF: - type = "leaf"; - break; - case P_LEAF|P_SUBP: - type = "dupsort-subleaf"; - break; - case P_LEAF|P_LEAF2: - type = "dupfixed-leaf"; - break; - case P_LEAF|P_LEAF2|P_SUBP: - type = "dupsort-dupfixed-subleaf"; - break; - case P_META: - case P_OVERFLOW: - default: - return MDB_CORRUPTED; - } - - for (align_bytes = i = 0; i < nkeys; - align_bytes += ((payload_size + align_bytes) & 1), i++) { - MDB_node *node; - - if (IS_LEAF2(mp)) { - /* LEAF2 pages have no mp_ptrs[] or node headers */ - payload_size += mp->mp_leaf2_ksize; + for (i=0; imp_leaf2_ksize; + key.mv_data = LEAF2KEY(mp, i, nsize); + total += nsize; + mdbx_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); continue; } - node = NODEPTR(mp, i); - payload_size += NODESIZE + node->mn_ksize; - + key.mv_size = node->mn_ksize; + key.mv_data = node->mn_data; + nsize = NODESIZE + key.mv_size; if (IS_BRANCH(mp)) { - rc = mdb_env_walk(ctx, dbi, NODEPGNO(node), deep); - if (rc) - return rc; - continue; - } - - assert(IS_LEAF(mp)); - if (node->mn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t *opg; - size_t over_header, over_payload, over_unused; - - payload_size += sizeof(pgno_t); - opg = NODEDATA(node); - rc = mdb_page_get(&mc, *opg, &omp, NULL); - if (rc) - return rc; - if (*opg != omp->mp_p.p_pgno) - return MDB_CORRUPTED; - /* LY: Don't use mask here, e.g bitwise (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. 
*/ - if (P_OVERFLOW != omp->mp_flags) - return MDB_CORRUPTED; - - over_header = PAGEHDRSZ; - over_payload = NODEDSZ(node); - over_unused = omp->mp_pages * ctx->mw_txn->mt_env->me_psize - - over_payload - over_header; - - rc = ctx->mw_visitor(*opg, omp->mp_pages, ctx->mw_user, dbi, - "overflow-data", 1, over_payload, over_header, over_unused); - if (rc) - return rc; - continue; - } - - payload_size += NODEDSZ(node); - if (node->mn_flags & F_SUBDATA) { - MDB_db *db = NODEDATA(node); - char* name = NULL; - - if (! (node->mn_flags & F_DUPDATA)) { - name = NODEKEY(node); - int namelen = (char*) db - name; - name = memcpy(alloca(namelen + 1), name, namelen); - name[namelen] = 0; - } - rc = mdb_env_walk(ctx, (name && name[0]) ? name : dbi, - db->md_root, deep + 1); - if (rc) - return rc; + mdbx_print("key %u: page %zu, %s\n", i, NODEPGNO(node), DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + total += nsize; + nsize += sizeof(indx_t); + mdbx_print("key %u: nsize %u, %s%s\n", + i, nsize, DKEY(&key), mdbx_leafnode_type(node)); } + total = EVEN(total); } - - return ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, type, - nkeys, payload_size, header_size, unused_size + align_bytes); + mdbx_print("Total: header %u + contents %u + unused %u\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); } -int __cold -mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func* visitor, void* user) +static void +mdbx_cursor_chk(MDB_cursor *mc) { - mdb_walk_ctx_t ctx; - int rc; - - if (unlikely(!txn)) - return MDB_BAD_TXN; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - ctx.mw_txn = txn; - ctx.mw_user = user; - ctx.mw_visitor = visitor; - - rc = visitor(0, 2, user, "lmdb", "meta", 2, sizeof(MDB_meta)*2, PAGEHDRSZ*2, - (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) *2); - if (! 
rc) - rc = mdb_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); - if (! rc) - rc = mdb_env_walk(&ctx, "main", txn->mt_dbs[MAIN_DBI].md_root, 0); - if (! rc) - rc = visitor(P_INVALID, 0, user, NULL, NULL, 0, 0, 0, 0); - return rc; -} - -int mdbx_canary_put(MDB_txn *txn, const mdbx_canary* canary) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EACCES; - - if (likely(canary)) { - txn->mt_canary.x = canary->x; - txn->mt_canary.y = canary->y; - txn->mt_canary.z = canary->z; - } - txn->mt_canary.v = txn->mt_txnid; - - return MDB_SUCCESS; -} - -size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary* canary) -{ - if(unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return 0; - - if (likely(canary)) - *canary = txn->mt_canary; - - return txn->mt_txnid; -} - -int mdbx_cursor_on_first(MDB_cursor *mc) -{ - if (unlikely(mc == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; - unsigned i; - for(i = 0; i < mc->mc_snum; ++i) { - if (mc->mc_ki[i]) - return MDBX_RESULT_FALSE; + MDB_node *node; + MDB_page *mp; + + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; + for (i=0; imc_top; i++) { + mp = mc->mc_pg[i]; + node = NODEPTR(mp, mc->mc_ki[i]); + if (unlikely(NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)) + mdbx_print("oops!\n"); } - - return MDBX_RESULT_TRUE; -} - -int mdbx_cursor_on_last(MDB_cursor *mc) -{ - if (unlikely(mc == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; - - unsigned i; - for(i = 0; i < mc->mc_snum; ++i) { - unsigned nkeys = NUMKEYS(mc->mc_pg[i]); - if (mc->mc_ki[i] < nkeys - 1) - return MDBX_RESULT_FALSE; + if 
(unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) + mdbx_print("ack!\n"); + if (XCURSOR_INITED(mc)) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && + mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { + mdbx_print("blah!\n"); + } } +} +#endif /* 0 */ - return MDBX_RESULT_TRUE; +/** Count all the pages in each DB and in the freelist + * and make sure it matches the actual number of pages + * being used. + * All named DBs must be open for a correct count. + */ +static void mdbx_audit(MDB_txn *txn) { + MDB_cursor mc; + MDB_val key, data; + MDB_ID freecount, count; + MDB_dbi i; + int rc; + + freecount = 0; + mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + mdbx_tassert(txn, rc == MDB_NOTFOUND); + + count = 0; + for (i = 0; i < txn->mt_numdbs; i++) { + MDB_xcursor mx; + if (!(txn->mt_dbflags[i] & DB_VALID)) + continue; + mdbx_cursor_init(&mc, txn, i, &mx); + if (txn->mt_dbs[i].md_root == P_INVALID) + continue; + count += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { + rc = mdbx_page_search(&mc, NULL, MDB_PS_FIRST); + for (; rc == MDB_SUCCESS; rc = mdbx_cursor_sibling(&mc, 1)) { + unsigned j; + MDB_page *mp; + mp = mc.mc_pg[mc.mc_top]; + for (j = 0; j < NUMKEYS(mp); j++) { + MDB_node *leaf = NODEPTR(mp, j); + if (leaf->mn_flags & F_SUBDATA) { + MDB_db db; + memcpy(&db, NODEDATA(leaf), sizeof(db)); + count += + db.md_branch_pages + db.md_leaf_pages + db.md_overflow_pages; + } + } + } + mdbx_tassert(txn, rc == MDB_NOTFOUND); + } + } + if (freecount + count + NUM_METAS != txn->mt_next_pgno) { + mdbx_print( + "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", + txn->mt_txnid, freecount, count + NUM_METAS, + freecount + count + NUM_METAS, txn->mt_next_pgno); + } } -int 
mdbx_cursor_eof(MDB_cursor *mc) +int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { + mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + return txn->mt_dbxs[dbi].md_cmp(a, b); +} + +int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { + mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + return txn->mt_dbxs[dbi].md_dcmp(a, b); +} + +/** Allocate memory for a page. + * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. + */ +static MDB_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { + MDB_env *env = txn->mt_env; + size_t size = env->me_psize; + MDB_page *np = env->me_dpages; + if (likely(num == 1 && np)) { + ASAN_UNPOISON_MEMORY_REGION(np, size); + VALGRIND_MEMPOOL_ALLOC(env, np, size); + VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); + env->me_dpages = np->mp_next; + } else { + size *= num; + np = malloc(size); + if (unlikely(!np)) { + txn->mt_flags |= MDB_TXN_ERROR; + return np; + } + VALGRIND_MEMPOOL_ALLOC(env, np, size); + } + + if ((env->me_flags & MDB_NOMEMINIT) == 0) { + /* For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. */ + size_t skip = PAGEHDRSZ; + if (num > 1) + skip += (num - 1) * env->me_psize; + memset((char *)np + skip, 0, size - skip); + } + VALGRIND_MAKE_MEM_UNDEFINED(np, size); + np->mp_flags = 0; + np->mp_pages = num; + return np; +} + +/** Free a single page. + * Saves single pages to a list, for future reuse. + * (This is not used for multi-page overflow pages.) 
+ */ +static MDBX_INLINE void mdbx_page_free(MDB_env *env, MDB_page *mp) { + mp->mp_next = env->me_dpages; + VALGRIND_MEMPOOL_FREE(env, mp); + env->me_dpages = mp; +} + +/** Free a dirty page */ +static void mdbx_dpage_free(MDB_env *env, MDB_page *dp) { + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdbx_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VALGRIND_MEMPOOL_FREE(env, dp); + free(dp); + } +} + +/** Return all dirty pages to dpage list */ +static void mdbx_dlist_free(MDB_txn *txn) { + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdbx_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; +} + +static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { + const size_t offs = env->me_psize * pgno; + const size_t shift = offsetof(MDB_page, mp_pb); + + if (env->me_flags & MDB_WRITEMAP) { + MDB_page *mp = (MDB_page *)(env->me_map + offs); + memset(&mp->mp_pb, 0x6F /* 'o', 111 */, env->me_psize - shift); + VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pb, env->me_psize - shift); + ASAN_POISON_MEMORY_REGION(&mp->mp_pb, env->me_psize - shift); + } else { + struct iovec iov[1]; + iov[0].iov_len = env->me_psize - shift; + iov[0].iov_base = alloca(iov[0].iov_len); + memset(iov[0].iov_base, 0x6F /* 'o', 111 */, iov[0].iov_len); + ssize_t rc = pwritev(env->me_fd, iov, 1, offs + shift); + assert(rc == (ssize_t)iov[0].iov_len); + (void)rc; + } +} + +/** Loosen or free a single page. + * Saves single pages to a list for future reuse + * in this same txn. It has been pulled from the freeDB + * and already resides on the dirty list, but has been + * deleted. Use these pages first before pulling again + * from the freeDB. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. 
+ */ +static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { + int loose = 0; + pgno_t pgno = mp->mp_pgno; + MDB_txn *txn = mc->mc_txn; + + if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { + if (txn->mt_parent) { + MDB_ID2 *dl = txn->mt_u.dirty_list; + /* If txn has a parent, make sure the page is in our + * dirty list. */ + if (dl[0].mid) { + unsigned x = mdbx_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PROBLEM; + } + /* ok, it's ours */ + loose = 1; + } + } + } else { + /* no parent txn, so it's just ours */ + loose = 1; + } + } + if (loose) { + mdbx_debug("loosen db %d page %zu", DDBI(mc), mp->mp_pgno); + MDB_page **link = &NEXT_LOOSE_PAGE(mp); + if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { + mdbx_kill_page(txn->mt_env, pgno); + VALGRIND_MAKE_MEM_UNDEFINED(link, sizeof(MDB_page *)); + ASAN_UNPOISON_MEMORY_REGION(link, sizeof(MDB_page *)); + } + *link = txn->mt_loose_pgs; + txn->mt_loose_pgs = mp; + txn->mt_loose_count++; + mp->mp_flags |= P_LOOSE; + } else { + int rc = mdbx_midl_append(&txn->mt_free_pgs, pgno); + if (unlikely(rc)) + return rc; + } + + return MDB_SUCCESS; +} + +/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. + * @param[in] mc A cursor handle for the current operation. + * @param[in] pflags Flags of the pages to update: + * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * @param[in] all No shortcuts. Needed except after a full #mdbx_page_flush(). + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { + enum { Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3, *m0 = mc; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; + + /* Mark pages seen by cursors: First m0, then tracked cursors */ + for (i = txn->mt_numdbs;;) { + if (mc->mc_flags & C_INITIALIZED) { + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j = 0; j < m3->mc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + /* Proceed to mx if it is at a sub-database */ + if (!(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (!(mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j - 1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + mc = mc->mc_next; + for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) + if (i == 0) + goto mark_done; + } + +mark_done: + if (all) { + /* Mark dirty root pages */ + for (i = 0; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if (unlikely((rc = mdbx_page_get(m0, pgno, &dp, &level)) != + MDB_SUCCESS)) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; +} + +static int mdbx_page_flush(MDB_txn *txn, int keep); + +/** Spill pages from the dirty list back to disk. + * This is intended to prevent running into #MDB_TXN_FULL situations, + * but note that they may still occur in a few cases: + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of #MDB_MULTIPLE items. + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. 
TODO: we probably should do + * a preemptive spill during #mdbx_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of #mdbx_page_touch(). Such references are + * handled by #mdbx_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. + * + * @param[in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * @param[in] key For a put operation, the key being stored. + * @param[in] data For a put operation, the data being stored. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, j, need; + int rc; + /* A cursor flagged C_SUB gets an immediate success, nothing is spilled */ + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + /* Estimate how much space this op will take */ + i = m0->mc_db->md_depth; + /* Named DBs also dirty the main DB */ + if (m0->mc_dbi >= CORE_DBS) + i += txn->mt_dbs[MAIN_DBI].md_depth; + /* For puts, roughly factor in the key+data size */ + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; /* double it for good measure */ + need = i; + /* More dirty-list room left than the estimate: no spill needed */ + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + /* Create the spill list on first use, otherwise compact it in place */ + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX); + if (unlikely(!txn->mt_spill_pgs)) + return ENOMEM; /* NOTE(review): direct return, bypasses the flag update at bailout: */ + } else { + /* purge deleted slots */ + MDB_IDL sl = txn->mt_spill_pgs; + unsigned num = sl[0]; + j = 0; + for (i = 1; i <= num; i++) { + if (!(sl[i] & 1)) /* low bit clear: the entry is kept */ + sl[++j] = sl[i]; + } + sl[0] = j; + } + + /* Preserve pages which may soon be dirtied again */ + rc = mdbx_pages_xkeep(m0, P_DIRTY, 1); + if (unlikely(rc != MDB_SUCCESS)) + goto bailout; + + /* Less aggressive spill - we originally spilled the entire dirty list, + * with a few exceptions for cursor pages and DB root pages. But this + * turns out to be a lot of wasted effort because in a large txn many + * of those pages will need to be used again. So now we spill only 1/8th + * of the dirty pages. Testing revealed this to be a good tradeoff, + * better than 1/2, 1/4, or 1/10. */ + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + + /* Save the page IDs of all the pages we're flushing */ + /* flush from the tail forward, this saves a lot of shifting later on. */ + for (i = dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; /* spill-list entries are page numbers shifted left by one */ + dp = dl[i].mptr; + if (dp->mp_flags & (P_LOOSE | P_KEEP)) + continue; + /* Can't spill twice, make sure it's not already in a parent's + * spill list. 
+ */ + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdbx_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) /* non-NULL only when the search loop broke out on a match */ + continue; + } + rc = mdbx_midl_append(&txn->mt_spill_pgs, pn); + if (unlikely(rc != MDB_SUCCESS)) + goto bailout; + need--; + } + mdbx_midl_sort(txn->mt_spill_pgs); + + /* Flush the spilled part of dirty list */ + rc = mdbx_page_flush(txn, i); + if (unlikely(rc != MDB_SUCCESS)) + goto bailout; + + /* Reset any dirty pages we kept that page_flush didn't see */ + rc = mdbx_pages_xkeep(m0, P_DIRTY | P_KEEP, i); + /* rc != 0 flags the txn MDB_TXN_ERROR, rc == 0 flags it MDB_TXN_SPILLS */ +bailout: + txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; +} +/** Data signature of a meta page. The hashing variant is compiled out, so `sign` keeps its initial MDB_DATASIGN_NONE and `meta` is unused; a value not above MDB_DATASIGN_WEAK is returned bit-inverted. */ +static MDBX_INLINE uint64_t mdbx_meta_sign(MDB_meta *meta) { + uint64_t sign = MDB_DATASIGN_NONE; +#if 0 /* TODO */ + sign = hippeus_hash64( + &meta->mm_mapsize, + sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize), + meta->mm_version | (uint64_t) MDB_MAGIC << 32 + ); +#else + (void)meta; +#endif + /* LY: never returns MDB_DATASIGN_NONE or MDB_DATASIGN_WEAK */ + return (sign > MDB_DATASIGN_WEAK) ? 
sign : ~sign; +} +/** Pick the meta page whose mm_txnid equals me_txns->mti_txnid. If neither matches, MDB_FATAL_ERROR is set in env->me_flags and the first meta page is returned. NOTE(review): presumably meant to run with the writer mutex held - confirm against callers. */ +static MDBX_INLINE MDB_meta *mdbx_meta_head_w(MDB_env *env) { + MDB_meta *a = METAPAGE_1(env); + MDB_meta *b = METAPAGE_2(env); + txnid_t head_txnid = env->me_txns->mti_txnid; + + mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); + if (a->mm_txnid == head_txnid) + return a; + if (likely(b->mm_txnid == head_txnid)) + return b; + + mdbx_debug("me_txns->mti_txnid not match meta-pages"); + mdbx_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid); + env->me_flags |= MDB_FATAL_ERROR; + return a; +} +/** Reader-side variant: compare both meta pages with mti_txnid, retry once after mdbx_coherent_barrier(), and finally fall back to mdbx_meta_head_w() around an attempt to take the writer mutex. */ +static MDB_meta *mdbx_meta_head_r(MDB_env *env) { + MDB_meta *a = METAPAGE_1(env); + MDB_meta *b = METAPAGE_2(env), *h; + +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + + txnid_t head_txnid = env->me_txns->mti_txnid; + mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); + if (likely(a->mm_txnid == head_txnid)) { + h = a; + } else if (likely(b->mm_txnid == head_txnid)) { + h = b; + } else { + /* LY: seems we got a collision with mdbx_env_sync0() */ + mdbx_coherent_barrier(); + head_txnid = env->me_txns->mti_txnid; + mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); + + if (likely(a->mm_txnid == head_txnid)) { + h = a; + } else if (likely(b->mm_txnid == head_txnid)) { + h = b; + } else { + /* LY: got a race again, or DB is corrupted */ + int rc = mdbx_mutex_lock(env, MDB_MUTEX(env, w)); /* a lock failure is tolerated: the lookup below runs either way */ + h = mdbx_meta_head_w(env); + if (rc == 0) /* unlock only if the lock was actually taken */ + mdbx_mutex_unlock(env, MDB_MUTEX(env, w)); + } + } + +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + + return h; +} +/** Return the other meta page: METAPAGE_2 when given METAPAGE_1, otherwise METAPAGE_1. */ +static MDBX_INLINE MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, + MDB_meta *meta) { + return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); +} +/** "Less than" for meta pages: with equal steadiness compare mm_txnid, otherwise `a` is less exactly when `b` is steady. */ +static MDBX_INLINE int mdbx_meta_lt(MDB_meta *a, MDB_meta *b) { + return (META_IS_STEADY(a) == META_IS_STEADY(b)) ? a->mm_txnid < b->mm_txnid + : META_IS_STEADY(b); +} + +/** Find oldest txnid still referenced. 
+ * The result is also stored in env->me_pgoldest. If laggard is non-NULL it receives the index of the reader slot holding that txnid, or -1. */ +static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + int i, reader; + MDB_reader *r = env->me_txns->mti_readers; + txnid_t oldest = env->me_txns->mti_txnid; + /* A weak meta page caps the result at the other meta page's txnid */ + MDB_meta *a = METAPAGE_1(env); + MDB_meta *b = METAPAGE_2(env); + if (META_IS_WEAK(a) && oldest > b->mm_txnid) + oldest = b->mm_txnid; + if (META_IS_WEAK(b) && oldest > a->mm_txnid) + oldest = a->mm_txnid; + /* Scan the reader slots in use (mr_pid != 0) for the smallest mr_txnid */ + for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0;) { + if (r[i].mr_pid) { + txnid_t snap = r[i].mr_txnid; + if (oldest > snap) { + oldest = snap; + reader = i; + } + } + } +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + + if (laggard) + *laggard = reader; + return env->me_pgoldest = oldest; /* cache the result in the env as well */ +} + +/** Add a page to the txn's dirty list */ +static void mdbx_page_dirty(MDB_txn *txn, MDB_page *mp) { + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + /* MDB_TXN_WRITEMAP txns use mdbx_mid2l_append, all others mdbx_mid2l_insert */ + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + insert = mdbx_mid2l_append; + } else { + insert = mdbx_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdbx_tassert(txn, rc == 0); /* a failed insert is only caught by this assertion */ + txn->mt_dirty_room--; +} + +/** Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. + * + * If there are free pages available from older transactions, they + * are re-used first. Otherwise allocate a new page at mt_next_pgno. + * Do not modify the freeDB, just merge freeDB records into me_pghead[] + * and move me_pglast to say which records were consumed. Only this + * function can create me_pghead and move me_pglast/mt_next_pgno. + * @param[in] mc A cursor handle identifying the transaction and + * database for which we are allocating. + * @param[in] num the number of pages to allocate. + * @param[out] mp Address of the allocated page(s). 
Requests for multiple + * pages will always be satisfied by a single contiguous chunk + * of memory. + * @return 0 on success, non-zero on failure. + */ +/* Bits for the `flags` argument of mdbx_page_alloc(): CACHE - use loose pages and the in-memory me_pghead list; GC - read further records from the freeDB; NEW - take fresh pages at mt_next_pgno; KICK - allow the sync retry path even when the map is not full. */ +#define MDBX_ALLOC_CACHE 1 +#define MDBX_ALLOC_GC 2 +#define MDBX_ALLOC_NEW 4 +#define MDBX_ALLOC_KICK 8 +#define MDBX_ALLOC_ALL \ + (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW | MDBX_ALLOC_KICK) + +static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { + int rc; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i = 0, j, mop_len = mop ? mop[0] : 0, n2 = num - 1; /* n2: distance from the first to the last page of the requested run */ + MDB_page *np; + txnid_t oldest = 0, last = 0; + MDB_cursor_op op; + MDB_cursor m2; + int found_oldest = 0; + + if (likely(flags & MDBX_ALLOC_GC)) { + flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); + if (unlikely(mc->mc_flags & C_RECLAIMING)) { + /* If mc is updating the freeDB, then the freelist cannot play + * catch-up with itself by growing while trying to save it. */ + flags &= + ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); + } + } + + if (likely(flags & MDBX_ALLOC_CACHE)) { + /* If there are any loose pages, just use them */ + assert(mp && num); + if (likely(num == 1 && txn->mt_loose_pgs)) { + np = txn->mt_loose_pgs; + txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + txn->mt_loose_count--; + mdbx_debug("db %d use loose page %zu", DDBI(mc), np->mp_pgno); + ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); + *mp = np; + return MDB_SUCCESS; + } + } + + /* If our dirty list is already full, we can't do anything */ + if (unlikely(txn->mt_dirty_room == 0)) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (;;) { /* oom-kick retry loop */ + for (op = MDB_FIRST;; + op = (flags & MDBX_LIFORECLAIM) ? MDB_PREV : MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl; + + /* Seek a big enough contiguous page range. Prefer + * pages at the tail, just truncating the list. 
+ */ + if (likely(flags & MDBX_ALLOC_CACHE) && mop_len > n2 && + (!(flags & MDBX_COALESCE) || op == MDB_FIRST)) { + i = mop_len; + do { + pgno = mop[i]; + if (likely(mop[i - n2] == pgno + n2)) + goto done; + } while (--i > n2); + } + + if (op == MDB_FIRST) { /* 1st iteration */ + /* Prepare to fetch more and coalesce */ + if (unlikely(!(flags & MDBX_ALLOC_GC))) + break; + + oldest = env->me_pgoldest; + mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); + if (flags & MDBX_LIFORECLAIM) { + if (!found_oldest) { + oldest = mdbx_find_oldest(env, NULL); + found_oldest = 1; + } + /* Begin from oldest reader if any */ + if (oldest > 2) { + last = oldest - 1; + op = MDB_SET_RANGE; + } + } else if (env->me_pglast) { + /* Continue lookup from env->me_pglast to higher/last */ + last = env->me_pglast; + op = MDB_SET_RANGE; + } + + key.mv_data = &last; + key.mv_size = sizeof(last); + } + + if (!(flags & MDBX_LIFORECLAIM)) { + /* Do not fetch more if the record will be too recent */ + if (op != MDB_FIRST && ++last >= oldest) { + if (!found_oldest) { + oldest = mdbx_find_oldest(env, NULL); + found_oldest = 1; + } + if (oldest <= last) + break; + } + } + + rc = mdbx_cursor_get(&m2, &key, NULL, op); + if (rc == MDB_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { + if (op == MDB_SET_RANGE) + continue; + found_oldest = 1; + if (oldest < mdbx_find_oldest(env, NULL)) { + oldest = env->me_pgoldest; /* value just refreshed by the mdbx_find_oldest() call above */ + last = oldest - 1; + key.mv_data = &last; + key.mv_size = sizeof(last); + op = MDB_SET_RANGE; + rc = mdbx_cursor_get(&m2, &key, NULL, op); + } + } + if (unlikely(rc)) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + + last = *(txnid_t *)key.mv_data; + if (oldest <= last) { + if (!found_oldest) { + oldest = mdbx_find_oldest(env, NULL); + found_oldest = 1; + } + if (oldest <= last) { + if (flags & MDBX_LIFORECLAIM) + continue; + break; + } + } + + if (flags & MDBX_LIFORECLAIM) { + if (txn->mt_lifo_reclaimed) { + for (j = txn->mt_lifo_reclaimed[0]; j > 0; --j) + if (txn->mt_lifo_reclaimed[j] == last) + 
break; + if (j) /* `last` is already in mt_lifo_reclaimed: skip this record */ + continue; + } + } + + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if (unlikely((rc = mdbx_node_read(&m2, leaf, &data)) != MDB_SUCCESS)) + goto fail; + + if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { + txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); + if (unlikely(!txn->mt_lifo_reclaimed)) { + rc = ENOMEM; + goto fail; + } + } + /* The record value is an ID list: idl[0] is the count, followed by that many page numbers */ + idl = (MDB_ID *)data.mv_data; + mdbx_tassert(txn, idl[0] == 0 || + data.mv_size == (idl[0] + 1) * sizeof(MDB_ID)); + i = idl[0]; + if (!mop) { + if (unlikely(!(env->me_pghead = mop = mdbx_midl_alloc(i)))) { + rc = ENOMEM; + goto fail; + } + } else { + if (unlikely((rc = mdbx_midl_need(&env->me_pghead, i)) != 0)) + goto fail; + mop = env->me_pghead; + } + if (flags & MDBX_LIFORECLAIM) { + if ((rc = mdbx_midl_append(&txn->mt_lifo_reclaimed, last)) != 0) + goto fail; + } + env->me_pglast = last; + + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { + mdbx_debug_extra("IDL read txn %zu root %zu num %u, IDL", last, + txn->mt_dbs[FREE_DBI].md_root, i); + for (j = i; j; j--) + mdbx_debug_extra_print(" %zu", idl[j]); + mdbx_debug_extra_print("\n"); + } + + /* Merge in descending sorted order */ + mdbx_midl_xmerge(mop, idl); + mop_len = mop[0]; + + if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { + /* force gc reclaim mode */ + return MDB_SUCCESS; + } + + /* Don't try to coalesce too much. 
+ */ + if (mop_len > MDB_IDL_UM_SIZE / 2) + break; + if (flags & MDBX_COALESCE) { + if (mop_len /* current size */ >= env->me_maxfree_1pg / 2 || + i /* prev size */ >= env->me_maxfree_1pg / 4) + flags &= ~MDBX_COALESCE; + } + } + /* While coalescing, the in-loop search runs only on the first iteration; repeat it here on the merged list */ + if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == + (MDBX_COALESCE | MDBX_ALLOC_CACHE) && + mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i - n2] == pgno + n2) + goto done; + } while (--i > n2); + } + + /* Use new pages from the map when nothing suitable in the freeDB */ + i = 0; + pgno = txn->mt_next_pgno; + rc = MDB_MAP_FULL; + if (likely(pgno + num <= env->me_maxpg)) { + rc = MDB_NOTFOUND; + if (likely(flags & MDBX_ALLOC_NEW)) + goto done; + } + + if ((flags & MDBX_ALLOC_GC) && + ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) { + MDB_meta *head = mdbx_meta_head_w(env); + MDB_meta *tail = mdbx_env_meta_flipflop(env, head); + + if (oldest == tail->mm_txnid && META_IS_WEAK(head) && + !META_IS_WEAK(tail)) { + MDB_meta meta = *head; + /* LY: Here an OOM has happened: + * - all pages had been allocated; + * - reclaiming was stopped at the last steady-sync; + * - the head-sync is weak. + * Now we need to make a sync to resume reclaiming. If both + * MDB_NOSYNC and MDB_MAPASYNC flags are set, then assume that + * utterly no-sync write mode was requested. In such case + * don't make a steady-sync, but only a legacy-mode checkpoint, + * just to resume reclaiming, not for data consistency. */ + + mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu", + head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', + tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', oldest, + env->me_txns->mt1.mtb.mtb_txnid); + + int flags = env->me_flags & MDB_WRITEMAP; /* NOTE: shadows the `flags` parameter inside this block */ + if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) + flags |= MDBX_UTTERLY_NOSYNC; + + mdbx_assert(env, env->me_sync_pending > 0); + if (mdbx_env_sync0(env, flags, &meta) == MDB_SUCCESS) { + txnid_t snap = mdbx_find_oldest(env, NULL); + if (snap > oldest) { + continue; + } + } + } + + if (rc == MDB_MAP_FULL) { +#if MDBX_MODE_ENABLED + txnid_t snap = mdbx_oomkick(env, oldest); +#else + mdbx_debug("DB size maxed out"); + txnid_t snap = mdbx_find_oldest(env, NULL); +#endif /* MDBX_MODE_ENABLED */ + if (snap > oldest) { + oldest = snap; + continue; + } + } + } + /* Error exit; also entered by goto from before the loop and from the done: section below */ + fail: + if (mp) { + *mp = NULL; + txn->mt_flags |= MDB_TXN_ERROR; + } + assert(rc); + return rc; + } + +done: + assert(mp && num); + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + /* LY: reset no-access flag from mdbx_kill_page() */ + VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); + ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize * num); + } else { + if (unlikely(!(np = mdbx_page_malloc(txn, num)))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { /* i != 0: the run was found in me_pghead ending at index i; i == 0: fresh pages at mt_next_pgno */ + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i - num; j < mop_len;) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + + if (env->me_flags & MDBX_PAGEPERTURB) + memset(np, 0x71 /* 'q', 113 */, env->me_psize * num); + VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); + + np->mp_pgno = pgno; + np->mp_leaf2_ksize = 0; + np->mp_flags = 0; + np->mp_pages = num; + mdbx_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; +} + +/** Copy the used portions of a non-overflow page. 
+ * @param[in] dst page to copy into + * @param[in] src page to copy from + * @param[in] psize size of a page + */ +static void mdbx_page_copy(MDB_page *dst, MDB_page *src, unsigned psize) { + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. */ + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper = (upper + PAGEBASE) & -Align; + memcpy(dst, src, (lower + PAGEBASE + (Align - 1)) & -Align); + memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper), + psize - upper); + } else { /* no aligned gap, or a LEAF2 page: one copy of the leading psize - unused bytes */ + memcpy(dst, src, psize - unused); + } +} + +/** Pull a page off the txn's spill list, if present. + * If a page being referenced was spilled to disk in this txn (or, as the loop + * below shows, in one of its parents), bring it back and make it dirty/writable again. + * @param[in] txn the transaction handle. + * @param[in] mp the page being referenced. It must not be dirty. + * @param[out] ret the writable page, if any. ret is unchanged if + * mp wasn't spilled. Returns MDB_SUCCESS, MDB_TXN_FULL or ENOMEM. + */ +static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; /* pn: the page number shifted left by one, the form searched for in mt_spill_pgs */ + + for (tx2 = txn; tx2; tx2 = tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdbx_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdbx_page_malloc(txn, num); + if (unlikely(!np)) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdbx_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + /* If in current txn, this page is no longer spilled. 
+ * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. */ + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits + */ + + mdbx_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; +} + +/** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc cursor pointing to the page to be touched + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_touch(MDB_cursor *mc) { + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + pgno_t pgno; + int rc; + + if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdbx_page_unspill(txn, mp, &np); + if (unlikely(rc)) + goto fail; + if (likely(np)) /* the page was unspilled: skip the allocate-and-copy path */ + goto done; + } + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) + goto fail; + pgno = np->mp_pgno; + mdbx_debug("touched db %d page %zu -> %zu", DDBI(mc), mp->mp_pgno, pgno); + mdbx_cassert(mc, mp->mp_pgno != pgno); + mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* the old page number goes onto mt_free_pgs */ + /* Update the parent page, if any, to point to the new page */ + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top - 1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top - 1]); + SETPGNO(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + pgno = mp->mp_pgno; + /* If txn has a parent, make sure the page is in our + * dirty list. */ + if (dl[0].mid) { + unsigned x = mdbx_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (unlikely(mp != dl[x].mptr)) { /* bad cursor? 
*/ + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PROBLEM; + } + return 0; + } + } + mdbx_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + /* No - copy it */ + np = mdbx_page_malloc(txn, 1); + if (unlikely(!np)) + return ENOMEM; /* NOTE(review): returns without setting MDB_TXN_ERROR, unlike the fail: path */ + mid.mid = pgno; + mid.mptr = np; + rc = mdbx_mid2l_insert(dl, &mid); + mdbx_cassert(mc, rc == 0); + } else { + return 0; + } + + mdbx_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; + +done: + /* Adjust cursors pointing to mp */ + mc->mc_pg[mc->mc_top] = np; + m2 = txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + for (; m2; m2 = m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2 = m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) + continue; + if (m2 == mc) + continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if (XCURSOR_INITED(m2) && IS_LEAF(np)) + XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); + } + } + } + return 0; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} +/** Sync the environment. Rejects a NULL env (EINVAL), a bad signature (MDB_VERSION_MISMATCH), a missing me_txns (MDB_PANIC) and MDB_RDONLY / MDB_FATAL_ERROR environments (EACCES). Returns early when the head meta is not weak, nothing is pending and the map size is unchanged; otherwise it may do an early msync()/fdatasync() and then calls mdbx_env_sync0() under the writer mutex. */ +int mdbx_env_sync(MDB_env *env, int force) { + int rc; + pthread_mutex_t *mutex; + MDB_meta *head; + unsigned flags; + + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!env->me_txns)) + return MDB_PANIC; + + flags = env->me_flags & ~MDB_NOMETASYNC; + if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) + return EACCES; + + head = mdbx_meta_head_r(env); + if (!META_IS_WEAK(head) && env->me_sync_pending == 0 && + env->me_mapsize == head->mm_mapsize) + /* LY: nothing to do */ + return MDB_SUCCESS; + + if (force || head->mm_mapsize != env->me_mapsize || + (env->me_sync_threshold && + env->me_sync_pending >= env->me_sync_threshold)) + flags &= MDB_WRITEMAP; /* keep only the MDB_WRITEMAP bit, clearing every other mode flag */ + + /* LY: early sync before acquiring the mutex to reduce 
writer's latency */ + if (env->me_sync_pending > env->me_psize * 16 && (flags & MDB_NOSYNC) == 0) { + if (flags & MDB_WRITEMAP) { + size_t used_size = env->me_psize * (head->mm_last_pg + 1); + rc = msync(env->me_map, used_size, + (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC); + } else { + rc = fdatasync(env->me_fd); + } + if (unlikely(rc)) + return errno; + } + + mutex = MDB_MUTEX(env, w); + rc = mdbx_mutex_lock(env, mutex); + if (unlikely(rc)) + return rc; + + /* LY: head may have changed while the mutex was being acquired. */ + head = mdbx_meta_head_w(env); + rc = MDB_SUCCESS; + if (META_IS_WEAK(head) || env->me_sync_pending != 0 || + env->me_mapsize != head->mm_mapsize) { + MDB_meta meta = *head; + rc = mdbx_env_sync0(env, flags, &meta); + } + + mdbx_mutex_unlock(env, mutex); + return rc; +} + +/** Back up parent txn's cursors, then grab the originals for tracking */ +static int mdbx_cursor_shadow(MDB_txn *src, MDB_txn *dst) { + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; + + for (i = src->mt_numdbs; --i >= 0;) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); /* a backup holds the cursor and, if present, its xcursor right behind it */ + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { /* bk->mc_next is the original next pointer, copied before mc->mc_next is rewritten below */ + bk = malloc(size); + if (unlikely(!bk)) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. */ + mc->mc_txn = dst; + mc->mc_dbflag = &dst->mt_dbflags[i]; + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk + 1) = *mx; + mx->mx_cursor.mc_txn = dst; + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; + } + } + } + return MDB_SUCCESS; +} + +/** Close this write txn's cursors, give parent txn's cursors back to parent. + * @param[in] txn the transaction handle. + * @param[in] merge true to keep changes to parent cursors, false to revert. 
+ * This function does not return a value. + */ +static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; + + for (i = txn->mt_numdbs; --i >= 0;) { + for (mc = cursors[i]; mc; mc = next) { + unsigned stage = mc->mc_signature; + mdbx_ensure(NULL, + stage == MDBX_MC_SIGNATURE || stage == MDBX_MC_WAIT4EOT); + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + /* Commit changes to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + /* Abort nested txn */ + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk + 1); + } /* NOTE: the enclosing `if (bk)` block is closed separately inside each preprocessor branch below */ +#if MDBX_MODE_ENABLED + bk->mc_signature = 0; + free(bk); + } + if (stage == MDBX_MC_WAIT4EOT) { + mc->mc_signature = 0; + free(mc); + } else { + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0 /* reset C_UNTRACK */; + } +#else + mc = bk; + } + /* Only malloced cursors are permanently tracked. */ + mc->mc_signature = 0; + free(mc); +#endif + } + cursors[i] = NULL; + } +} + +/** Set or check a pid lock. Set returns 0 on success. + * Check returns 0 if the process is certainly dead, nonzero if it may + * be alive (the lock exists or an error happened so we do not know). + */ +static int mdbx_reader_pid(MDB_env *env, int op, pid_t pid) { + for (;;) { + int rc; + struct flock lock_info; + memset(&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = pid; /* one-byte lock range at file offset == pid */ + lock_info.l_len = 1; + if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { + if (op == F_GETLK && lock_info.l_type != F_UNLCK) /* F_GETLK reported a lock type other than F_UNLCK */ + rc = -1; + } else if ((rc = errno) == EINTR) { /* interrupted: try again */ + continue; + } + return rc; + } +} + +/** Common code for #mdbx_txn_begin() and #mdbx_txn_renew(). 
+ * @param[in] txn the transaction handle to initialize; `flags` selects the reader path (MDB_TXN_RDONLY set) or the writer path + * @return 0 on success, non-zero on failure. + */ +static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { + MDB_env *env = txn->mt_env; + unsigned i, nr; + int rc, new_notls = 0; + /* The current pid differs from the one recorded in me_pid: flag a fatal error */ + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + if (flags & MDB_TXN_RDONLY) { + MDBX_rthc *rthc = NULL; + MDB_reader *r = NULL; + + txn->mt_flags = MDB_TXN_RDONLY; + if (likely(env->me_flags & MDB_ENV_TXKEY)) { + mdbx_assert(env, !(env->me_flags & MDB_NOTLS)); + rthc = mdbx_rthc_get(env->me_txkey); + if (unlikely(!rthc)) + return ENOMEM; + if (likely(rthc->rc_reader)) { + r = rthc->rc_reader; + mdbx_assert(env, r->mr_pid == env->me_pid); + mdbx_assert(env, r->mr_tid == pthread_self()); + } + } else { + mdbx_assert(env, env->me_flags & MDB_NOTLS); + r = txn->mt_u.reader; + } + + if (likely(r)) { + if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) + return MDB_BAD_RSLOT; + } else { + pid_t pid = env->me_pid; + pthread_t tid = pthread_self(); + pthread_mutex_t *rmutex = MDB_MUTEX(env, r); + + rc = mdbx_mutex_lock(env, rmutex); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + if (unlikely(!env->me_live_reader)) { + rc = mdbx_reader_pid(env, F_SETLK, pid); + if (unlikely(rc != MDB_SUCCESS)) { + mdbx_mutex_unlock(env, rmutex); + return rc; + } + env->me_live_reader = 1; + } + + nr = env->me_txns->mti_numreaders; + for (i = 0; i < nr; i++) + if (env->me_txns->mti_readers[i].mr_pid == 0) + break; + if (unlikely(i == env->me_maxreaders)) { + mdbx_mutex_unlock(env, rmutex); + return MDB_READERS_FULL; + } + r = &env->me_txns->mti_readers[i]; + /* Claim the reader slot, carefully since other code + * uses the reader table un-mutexed: First reset the + * slot, next publish it in mti_numreaders. After + * that, it is safe for mdbx_env_close() to touch it. + * When it will be closed, we can finally claim it. 
*/ + r->mr_pid = 0; + r->mr_txnid = ~(txnid_t)0; + r->mr_tid = tid; + mdbx_coherent_barrier(); +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + if (i == nr) + env->me_txns->mti_numreaders = ++nr; + if (env->me_close_readers < nr) + env->me_close_readers = nr; + r->mr_pid = pid; +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + mdbx_mutex_unlock(env, rmutex); + /* With no rthc holder, MDB_END_SLOT is passed to mdbx_txn_end() if this begin fails */ + new_notls = MDB_END_SLOT; + if (likely(rthc)) { + rthc->rc_reader = r; + new_notls = 0; + } + } + /* Publish the head txnid in the reader slot, re-read mti_txnid after a barrier, and repeat until both agree or a fatal error is flagged */ + while ((env->me_flags & MDB_FATAL_ERROR) == 0) { + MDB_meta *meta = mdbx_meta_head_r(txn->mt_env); + txnid_t lead = meta->mm_txnid; + r->mr_txnid = lead; + mdbx_coherent_barrier(); + + txnid_t snap = txn->mt_env->me_txns->mti_txnid; + /* LY: Retry on a race, ITS#7970. */ + if (likely(lead == snap)) { + txn->mt_txnid = lead; + txn->mt_next_pgno = meta->mm_last_pg + 1; + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); +#if MDBX_MODE_ENABLED + txn->mt_canary = meta->mm_canary; +#endif + break; + } + } + + txn->mt_u.reader = r; + txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + } else { + /* Not yet touching txn == env->me_txn0, it may be active */ + rc = mdbx_mutex_lock(env, MDB_MUTEX(env, w)); + if (unlikely(rc)) + return rc; + +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + MDB_meta *meta = mdbx_meta_head_w(env); +#if MDBX_MODE_ENABLED + txn->mt_canary = meta->mm_canary; +#endif + txn->mt_txnid = meta->mm_txnid + 1; + txn->mt_flags = flags; +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + +#if MDB_DEBUG + if (unlikely(txn->mt_txnid == mdbx_debug_edge)) { + if (!mdbx_debug_logger) + mdbx_runtime_flags |= + MDBX_DBG_TRACE | MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_ASSERT; + mdbx_debug_log(MDBX_DBG_EDGE, __FUNCTION__, __LINE__, + "on/off edge (txn %zu)", txn->mt_txnid); + } +#endif + txn->mt_child = NULL; + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 
0; + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + if (txn->mt_lifo_reclaimed) + txn->mt_lifo_reclaimed[0] = 0; + env->me_txn = txn; + memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned)); + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + /* Moved to here to avoid a data race in read TXNs */ + txn->mt_next_pgno = meta->mm_last_pg + 1; + } + + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + unsigned x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = + (x & MDB_VALID) ? DB_VALID | DB_USRVALID | DB_STALE : 0; + } + txn->mt_dbflags[MAIN_DBI] = DB_VALID | DB_USRVALID; + txn->mt_dbflags[FREE_DBI] = DB_VALID; + + if (unlikely(env->me_flags & MDB_FATAL_ERROR)) { + mdbx_debug("environment had fatal error, must shutdown!"); + rc = MDB_PANIC; + } else if (unlikely(env->me_maxpg < txn->mt_next_pgno)) { + rc = MDB_MAP_RESIZED; + } else { + return MDB_SUCCESS; + } + mdbx_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); + return rc; +} +/** Re-arm a finished read-only txn via mdbx_txn_renew0(). Returns EINVAL for a NULL txn or when F_ISSET(mt_flags, MDB_TXN_RDONLY | MDB_TXN_FINISHED) fails, MDB_VERSION_MISMATCH for a bad signature. */ +int mdbx_txn_renew(MDB_txn *txn) { + int rc; + + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY | MDB_TXN_FINISHED))) + return EINVAL; + + rc = mdbx_txn_renew0(txn, MDB_TXN_RDONLY); + if (rc == MDB_SUCCESS) { + mdbx_debug("renew txn %zu%c %p on mdbenv %p, root page %zu", txn->mt_txnid, + (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, + (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + } + return rc; +} +/** Begin a transaction and store it in *ret. Read-only and nested txns are allocated with calloc(); a top-level write txn reuses env->me_txn0. Error returns seen here: EINVAL, MDB_VERSION_MISMATCH, MDB_PANIC, EACCES, MDB_BAD_TXN, ENOMEM, or whatever mdbx_txn_renew0() / mdbx_cursor_shadow() report. */ +int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, + MDB_txn **ret) { + MDB_txn *txn; + MDB_ntxn *ntxn; + int rc, size, tsize; + + if (unlikely(!env || !ret)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + flags &= MDB_TXN_BEGIN_FLAGS; + flags |= env->me_flags & MDB_WRITEMAP; + + if (unlikely(env->me_flags & MDB_RDONLY & + ~flags)) /* write txn in RDONLY env */ + return EACCES; + + if (parent) { + if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) + return EINVAL; + + /* Nested transactions: Max 1 child, write txns only, no writemap */ + flags |= parent->mt_flags; + if (unlikely(flags & (MDB_RDONLY | MDB_WRITEMAP | MDB_TXN_BLOCKED))) { + return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + } + /* Child txns save MDB_pgstate and use own copy of cursors */ + size = env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + 1); + size += tsize = sizeof(MDB_ntxn); + } else if (flags & MDB_RDONLY) { + size = env->me_maxdbs * (sizeof(MDB_db) + 1); + size += tsize = sizeof(MDB_txn); + } else { + /* Reuse preallocated write txn. However, do not touch it until + * mdbx_txn_renew0() succeeds, since it currently may be active. 
*/ + txn = env->me_txn0; + goto renew; + } + if (unlikely((txn = calloc(1, size)) == NULL)) { + mdbx_debug("calloc: %s", strerror(errno)); + return ENOMEM; + } + txn->mt_dbxs = env->me_dbxs; /* static */ + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); /* mt_dbs[] starts right after the txn struct */ + txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; /* mt_dbflags[] occupies the last me_maxdbs bytes of the block */ + txn->mt_flags = flags; + txn->mt_env = env; + + if (parent) { + unsigned i; + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = parent->mt_dbiseqs; + txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2) * MDB_IDL_UM_SIZE); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX))) { + free(txn->mt_u.dirty_list); + free(txn); + return ENOMEM; + } + txn->mt_txnid = parent->mt_txnid; + txn->mt_dirty_room = parent->mt_dirty_room; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; + txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_flags |= MDB_TXN_HAS_CHILD; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + /* Copy parent's mt_dbflags, but clear DB_NEW */ + for (i = 0; i < txn->mt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; + rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = mdbx_midl_alloc(env->me_pghead[0]); + if (likely(env->me_pghead)) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + if (likely(!rc)) + rc = mdbx_cursor_shadow(parent, txn); + if (unlikely(rc)) + mdbx_txn_end(txn, MDB_END_FAIL_BEGINCHILD); + } else { /* MDB_RDONLY */ + txn->mt_dbiseqs = env->me_dbiseqs; + renew: /* also the entry point for the preallocated write txn (goto renew above) */ + rc = mdbx_txn_renew0(txn, flags); + } + if (unlikely(rc)) { + if (txn != env->me_txn0) + free(txn); + } else { + txn->mt_signature = MDBX_MT_SIGNATURE; + *ret = txn; + mdbx_debug("begin 
txn %zu%c %p on mdbenv %p, root page %zu", txn->mt_txnid, + (flags & MDB_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, + txn->mt_dbs[MAIN_DBI].md_root); + } + + return rc; +} +/** Return the environment of txn, or NULL if txn is NULL or its signature is wrong. */ +MDB_env *mdbx_txn_env(MDB_txn *txn) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return NULL; + return txn->mt_env; +} +/** Return txn->mt_txnid, or 0 if txn is NULL or its signature is wrong. */ +size_t mdbx_txn_id(MDB_txn *txn) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return 0; + return txn->mt_txnid; +} + +/** Export or close DBI handles opened in this txn. */ +static void mdbx_dbis_update(MDB_txn *txn, int keep) { + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= CORE_DBS;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { /* keep: publish the handle's flags to the env, marked MDB_VALID */ + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { /* discard: clear the env entry, bump me_dbiseqs[i] and free the name */ + char *ptr = env->me_dbxs[i].md_name.mv_data; + if (ptr) { + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + env->me_dbiseqs[i]++; + free(ptr); + } + } + } + } + if (keep && env->me_numdbs < n) /* raise the env's DBI count to this txn's count */ + env->me_numdbs = n; +} + +/** End a transaction, except successful commit of a nested transaction. + * May be called twice for readonly txns: First reset it, then abort. + * @param[in] txn the transaction handle to end + * @param[in] mode why and how to end the transaction + */ +static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { + MDB_env *env = txn->mt_env; + static const char *const names[] = MDB_END_NAMES; + + if (unlikely(txn->mt_env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + /* Export or close DBI handles opened in this txn */ + mdbx_dbis_update(txn, mode & MDB_END_UPDATE); + + mdbx_debug("%s txn %zu%c %p on mdbenv %p, root page %zu", + names[mode & MDB_END_OPMASK], txn->mt_txnid, + (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, + (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + txn->mt_u.reader->mr_txnid = ~(txnid_t)0; + if (!(env->me_flags & MDB_NOTLS)) { + txn->mt_u.reader = NULL; /* txn does not own reader */ + } else if (mode & MDB_END_SLOT) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } /* else txn owns the slot until it does MDB_END_SLOT */ +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + } + mdbx_coherent_barrier(); + txn->mt_numdbs = 0; /* prevent further DBI activity */ + txn->mt_flags |= MDB_TXN_FINISHED; + + } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { + pgno_t *pghead = env->me_pghead; + + if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ + mdbx_cursors_eot(txn, 0); + if (!(env->me_flags & MDB_WRITEMAP)) { + mdbx_dlist_free(txn); + } + + if (txn->mt_lifo_reclaimed) { + txn->mt_lifo_reclaimed[0] = 0; + if (txn != env->me_txn0) { + mdbx_midl_free(txn->mt_lifo_reclaimed); + txn->mt_lifo_reclaimed = NULL; + } + } + txn->mt_numdbs = 0; + txn->mt_flags = MDB_TXN_FINISHED; + + if (!txn->mt_parent) { + mdbx_midl_shrink(&txn->mt_free_pgs); + env->me_free_pgs = txn->mt_free_pgs; + /* me_pgstate: */ + env->me_pghead = NULL; + env->me_pglast = 0; + + env->me_txn = NULL; + mode = 0; /* txn == env->me_txn0, do not free() it */ + + /* The writer mutex was locked in mdbx_txn_begin. 
*/ + mdbx_mutex_unlock(env, MDB_MUTEX(env, w)); + } else { + txn->mt_parent->mt_child = NULL; + txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdbx_midl_free(txn->mt_free_pgs); + mdbx_midl_free(txn->mt_spill_pgs); + free(txn->mt_u.dirty_list); + } + + mdbx_midl_free(pghead); + } + + if (mode & MDB_END_FREE) { + txn->mt_signature = 0; + free(txn); + } + + return MDB_SUCCESS; +} + +int mdbx_txn_reset(MDB_txn *txn) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + /* This call is only valid for read-only txns */ + if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) + return EINVAL; + +#if MDBX_MODE_ENABLED + /* LY: don't close DBI-handles in MDBX mode */ + return mdbx_txn_end(txn, MDB_END_RESET | MDB_END_UPDATE); +#else + return mdbx_txn_end(txn, MDB_END_RESET); +#endif /* MDBX_MODE_ENABLED */ +} + +int mdbx_txn_abort(MDB_txn *txn) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + +#if MDBX_MODE_ENABLED + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + /* LY: don't close DBI-handles in MDBX mode */ + return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_UPDATE | MDB_END_SLOT | + MDB_END_FREE); +#endif /* MDBX_MODE_ENABLED */ + + if (txn->mt_child) + mdbx_txn_abort(txn->mt_child); + + return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_SLOT | MDB_END_FREE); +} + +static MDBX_INLINE int mdbx_backlog_size(MDB_txn *txn) { + int reclaimed = txn->mt_env->me_pghead ? txn->mt_env->me_pghead[0] : 0; + return reclaimed + txn->mt_loose_count; +} + +/* LY: Prepare a backlog of pages to modify FreeDB itself, + * while reclaiming is prohibited. It should be enough to prevent search + * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. 
*/ +static int mdbx_prep_backlog(MDB_txn *txn, MDB_cursor *mc) { + /* LY: extra page(s) for b-tree rebalancing */ + const int extra = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) ? 2 : 1; + + if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { + int rc = mdbx_cursor_touch(mc); + if (unlikely(rc)) + return rc; + + while (unlikely(mdbx_backlog_size(txn) < extra)) { + rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); + if (unlikely(rc)) { + if (unlikely(rc != MDB_NOTFOUND)) + return rc; + break; + } + } + } + + return MDB_SUCCESS; +} + +/** Save the freelist as of this transaction to the freeDB. + * This changes the freelist. Keep trying until it stabilizes. + */ +static int mdbx_freelist_save(MDB_txn *txn) { + /* env->me_pghead[] can grow and shrink during this call. + * env->me_pglast and txn->mt_free_pgs[] can only grow. + * Page numbers cannot disappear from txn->mt_free_pgs[]. */ + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + unsigned cleanup_idx = 0, refill_idx = 0; + const int lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; + + mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); + + /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ + clean_limit = (env->me_flags & (MDB_NOMEMINIT | MDB_WRITEMAP)) ? SSIZE_MAX + : maxfree_1pg; + +again: + for (;;) { + /* Come back here after each Put() in case freelist changed */ + MDB_val key, data; + pgno_t *pgs; + ssize_t j; + + if (!lifo) { + /* If using records from freeDB which we have not yet + * deleted, delete them and any we reserved for me_pghead. 
*/ + while (pglast < env->me_pglast) { + rc = mdbx_cursor_first(&mc, &key, NULL); + if (unlikely(rc)) + goto bailout; + rc = mdbx_prep_backlog(txn, &mc); + if (unlikely(rc)) + goto bailout; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + more = 1; + mdbx_tassert(txn, pglast <= env->me_pglast); + mc.mc_flags |= C_RECLAIMING; + rc = mdbx_cursor_del(&mc, 0); + mc.mc_flags &= ~C_RECLAIMING; + if (unlikely(rc)) + goto bailout; + } + } else if (txn->mt_lifo_reclaimed) { + /* LY: cleanup reclaimed records. */ + while (cleanup_idx < txn->mt_lifo_reclaimed[0]) { + pglast = txn->mt_lifo_reclaimed[++cleanup_idx]; + key.mv_data = &pglast; + key.mv_size = sizeof(pglast); + rc = mdbx_cursor_get(&mc, &key, NULL, MDB_SET); + if (likely(rc != MDB_NOTFOUND)) { + if (unlikely(rc)) + goto bailout; + rc = mdbx_prep_backlog(txn, &mc); + if (unlikely(rc)) + goto bailout; + mc.mc_flags |= C_RECLAIMING; + rc = mdbx_cursor_del(&mc, 0); + mc.mc_flags &= ~C_RECLAIMING; + if (unlikely(rc)) + goto bailout; + } + } + } + + if (unlikely(!env->me_pghead) && txn->mt_loose_pgs) { + /* Put loose page numbers in mt_free_pgs, since + * we may be unable to return them to me_pghead. 
*/ + MDB_page *mp = txn->mt_loose_pgs; + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, + txn->mt_loose_count)) != 0)) + return rc; + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) + mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + } + + /* Save the IDL of pages freed by this txn, to a single record */ + if (freecnt < txn->mt_free_pgs[0]) { + if (unlikely(!freecnt)) { + /* Make sure last page of freeDB is touched and on freelist */ + rc = mdbx_page_search(&mc, NULL, MDB_PS_LAST | MDB_PS_MODIFY); + if (unlikely(rc && rc != MDB_NOTFOUND)) + goto bailout; + } + free_pgs = txn->mt_free_pgs; + /* Write to last page of freeDB */ + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (unlikely(rc)) + goto bailout; + /* Retry if mt_free_pgs[] grew during the Put() */ + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + + mdbx_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); + + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { + unsigned i = free_pgs[0]; + mdbx_debug_extra("IDL write txn %zu root %zu num %u, IDL", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); + for (; i; i--) + mdbx_debug_extra_print(" %zu", free_pgs[i]); + mdbx_debug_extra_print("\n"); + } + continue; + } + + mop = env->me_pghead; + mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; + + if (mop_len && refill_idx == 0) + refill_idx = 1; + + /* Reserve records for me_pghead[]. Split it if multi-page, + * to avoid searching freeDB for a page range. Use keys in + * range [1,me_pglast]: Smaller than txnid of oldest reader. 
*/ + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + /* Keep current record (overflow page), add a new one */ + head_id--; + refill_idx++; + head_room = 0; + } + + if (lifo) { + if (refill_idx > + (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) { + /* LY: need just a txn-id for save page list. */ + rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); + if (likely(rc == 0)) + /* LY: ok, reclaimed from freedb. */ + continue; + if (unlikely(rc != MDB_NOTFOUND)) + /* LY: other troubles... */ + goto bailout; + + /* LY: freedb is empty, will look any free txn-id in high2low order. + */ + if (unlikely(env->me_pglast < 1)) { + /* LY: not any txn in the past of freedb. */ + rc = MDB_MAP_FULL; + goto bailout; + } + + if (unlikely(!txn->mt_lifo_reclaimed)) { + txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); + if (unlikely(!txn->mt_lifo_reclaimed)) { + rc = ENOMEM; + goto bailout; + } + } + /* LY: append the list. */ + rc = mdbx_midl_append(&txn->mt_lifo_reclaimed, env->me_pglast - 1); + if (unlikely(rc)) + goto bailout; + --env->me_pglast; + /* LY: note that freeDB cleanup is not needed. 
*/ + ++cleanup_idx; + } + head_id = txn->mt_lifo_reclaimed[refill_idx]; + } + + /* (Re)write {key = head_id, IDL length = head_room} */ + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + /* Overflow multi-page for part of me_pghead */ + head_room /= head_id; /* amortize page sizes */ + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + /* Rare case, not bothering to delete this record */ + head_room = 0; + continue; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); + rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (unlikely(rc)) + goto bailout; + /* IDL is initially empty, zero out at least the length */ + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; + } + + mdbx_tassert(txn, + cleanup_idx == + (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); + + /* Return loose page numbers to me_pghead, though usually none are + * left at this point. The pages themselves remain in dirty_list. 
*/ + if (txn->mt_loose_pgs) { + MDB_page *mp = txn->mt_loose_pgs; + unsigned count = txn->mt_loose_count; + MDB_IDL loose; + /* Room for loose pages + temp IDL with same */ + if ((rc = mdbx_midl_need(&env->me_pghead, 2 * count + 1)) != 0) + goto bailout; + mop = env->me_pghead; + loose = mop + MDB_IDL_ALLOCLEN(mop) - count; + for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) + loose[++count] = mp->mp_pgno; + loose[0] = count; + mdbx_midl_sort(loose); + mdbx_midl_xmerge(mop, loose); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + mop_len = mop[0]; + } + + /* Fill in the reserved me_pghead records */ + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len; + if (!lifo) { + rc = mdbx_cursor_first(&mc, &key, &data); + if (unlikely(rc)) + goto bailout; + } + + for (;;) { + txnid_t id; + ssize_t len; + MDB_ID save; + + if (!lifo) { + id = *(txnid_t *)key.mv_data; + mdbx_tassert(txn, id <= env->me_pglast); + } else { + mdbx_tassert(txn, + refill_idx > 0 && refill_idx <= txn->mt_lifo_reclaimed[0]); + id = txn->mt_lifo_reclaimed[refill_idx--]; + key.mv_data = &id; + key.mv_size = sizeof(id); + rc = mdbx_cursor_get(&mc, &key, &data, MDB_SET); + if (unlikely(rc)) + goto bailout; + } + mdbx_tassert( + txn, cleanup_idx == + (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); + + len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + mdbx_tassert(txn, len >= 0); + if (len > mop_len) + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + key.mv_data = &id; + key.mv_size = sizeof(id); + data.mv_data = mop -= len; + + save = mop[0]; + mop[0] = len; + rc = mdbx_cursor_put(&mc, &key, &data, MDB_CURRENT); + mdbx_tassert( + txn, cleanup_idx == + (txn->mt_lifo_reclaimed ? 
txn->mt_lifo_reclaimed[0] : 0)); + mop[0] = save; + if (unlikely(rc || (mop_len -= len) == 0)) + goto bailout; + + if (!lifo) { + rc = mdbx_cursor_next(&mc, &key, &data, MDB_NEXT); + if (unlikely(rc)) + goto bailout; + } + } + } + +bailout: + if (txn->mt_lifo_reclaimed) { + mdbx_tassert(txn, rc || cleanup_idx == txn->mt_lifo_reclaimed[0]); + if (rc == 0 && cleanup_idx != txn->mt_lifo_reclaimed[0]) { + mdbx_tassert(txn, cleanup_idx < txn->mt_lifo_reclaimed[0]); + /* LY: zeroed cleanup_idx to force cleanup & refill created freeDB + * records. */ + cleanup_idx = 0; + /* LY: restart filling */ + refill_idx = total_room = head_room = 0; + more = 1; + goto again; + } + txn->mt_lifo_reclaimed[0] = 0; + if (txn != env->me_txn0) { + mdbx_midl_free(txn->mt_lifo_reclaimed); + txn->mt_lifo_reclaimed = NULL; + } + } + + return rc; +} + +/** Flush (some) dirty pages to the map, after clearing their dirty flag. + * @param[in] txn the transaction that's being committed + * @param[in] keep number of initial pages in dirty_list to keep dirty. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_flush(MDB_txn *txn, int keep) { + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + int n = 0; + + j = i = keep; + + if (env->me_flags & MDB_WRITEMAP) { + /* Clear dirty flags */ + while (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE | P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[++j] = dl[i]; + continue; + } + dp->mp_flags &= ~P_DIRTY; + env->me_sync_pending += IS_OVERFLOW(dp) ? 
psize * dp->mp_pages : psize; + } + goto done; + } + + /* Write the pages */ + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE | P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[i].mid = 0; + continue; + } + pgno = dl[i].mid; + /* clear dirty flag */ + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) + size *= dp->mp_pages; + env->me_sync_pending += size; + } + /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ + if (pos != next_pos || n == MDB_COMMIT_PAGES || wsize + size > MAX_WRITE) { + if (n) { + retry: + /* Write previous page(s) */ + wres = pwritev(env->me_fd, iov, n, wpos); + if (unlikely(wres != wsize)) { + if (wres < 0) { + rc = errno; + if (rc == EINTR) + goto retry; + mdbx_debug("Write error: %s", strerror(rc)); + } else { + rc = EIO; /* TODO: Use which error code? */ + mdbx_debug("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; + } + mdbx_debug("committing page %zu", pgno); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; + } + + mdbx_invalidate_cache(env->me_map, txn->mt_next_pgno * env->me_psize); + + for (i = keep; ++i <= pagecount;) { + dp = dl[i].mptr; + /* This is a page we skipped above */ + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdbx_dpage_free(env, dp); + } + +done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; + return MDB_SUCCESS; +} + +int mdbx_txn_commit(MDB_txn *txn) { + int rc; + unsigned i, end_mode; + MDB_env *env; + + if (unlikely(txn == NULL)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(txn->mt_env->me_pid != getpid())) { + txn->mt_env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + /* mdbx_txn_end() mode for a commit which writes nothing */ + end_mode = + 
MDB_END_EMPTY_COMMIT | MDB_END_UPDATE | MDB_END_SLOT | MDB_END_FREE; + + if (txn->mt_child) { + rc = mdbx_txn_commit(txn->mt_child); + txn->mt_child = NULL; + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + } + + env = txn->mt_env; + + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) { + goto done; + } + + if (unlikely(txn->mt_flags & (MDB_TXN_FINISHED | MDB_TXN_ERROR))) { + mdbx_debug("error flag is set, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } + + if (txn->mt_parent) { + MDB_txn *parent = txn->mt_parent; + MDB_page **lp; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; + + /* Append our reclaim list to parent's */ + if (txn->mt_lifo_reclaimed) { + if (parent->mt_lifo_reclaimed) { + rc = mdbx_midl_append_list(&parent->mt_lifo_reclaimed, + txn->mt_lifo_reclaimed); + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + mdbx_midl_free(txn->mt_lifo_reclaimed); + } else + parent->mt_lifo_reclaimed = txn->mt_lifo_reclaimed; + txn->mt_lifo_reclaimed = NULL; + } + + /* Append our free list to parent's */ + rc = mdbx_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + mdbx_midl_free(txn->mt_free_pgs); + /* Failures after this must either undo the changes + * to the parent or set MDB_TXN_ERROR in the parent. */ + + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + /* Merge our cursors into parent's and close them */ + mdbx_cursors_eot(txn, 1); + + /* Update parent's DB table. 
*/ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; + parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + /* preserve parent's DB_NEW status */ + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + /* Remove anything in our dirty list from parent's spill list */ + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + /* Mark our dirty pages as deleted in parent spill list */ + for (i = 0, len = src[0].mid; ++i <= len;) { + MDB_ID pn = src[i].mid << 1; + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + /* Squash deleted pagenums if we deleted any */ + for (x = y; ++x <= ps_len;) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + /* Remove anything in our spill list from parent's dirty list */ + if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { + for (i = 1; i <= txn->mt_spill_pgs[0]; i++) { + MDB_ID pn = txn->mt_spill_pgs[i]; + if (pn & 1) + continue; /* deleted spillpg */ + pn >>= 1; + y = mdbx_mid2l_search(dst, pn); + if (y <= dst[0].mid && dst[y].mid == pn) { + free(dst[y].mptr); + while (y < dst[0].mid) { + dst[y] = dst[y + 1]; + y++; + } + dst[0].mid--; + } + } + } + + /* Find len = length of merging our dirty list with parent's */ + x = dst[0].mid; + dst[0].mid = 0; /* simplify loops */ + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdbx_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { /* Simplify the above for single-ancestor case */ + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + /* Merge our dirty list with 
parent's */ + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdbx_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + /* TODO: Prevent failure here, so parent does not fail */ + rc = mdbx_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (unlikely(rc != MDB_SUCCESS)) + parent->mt_flags |= MDB_TXN_ERROR; + mdbx_midl_free(txn->mt_spill_pgs); + mdbx_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + /* Append our loose page list to parent's */ + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) + ; + *lp = txn->mt_loose_pgs; + parent->mt_loose_count += txn->mt_loose_count; + + parent->mt_child = NULL; + mdbx_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + txn->mt_signature = 0; + free(txn); + return rc; + } + + env = txn->mt_env; + if (unlikely(txn != env->me_txn)) { + mdbx_debug("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdbx_cursors_eot(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY | MDB_TXN_SPILLS))) + goto done; + + mdbx_debug("committing txn %zu %p on mdbenv %p, root page %zu", txn->mt_txnid, + (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + + /* Update DB root pointers */ + if (txn->mt_numdbs > CORE_DBS) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + if (unlikely(TXN_DBI_CHANGED(txn, i))) { + rc = MDB_BAD_DBI; + goto fail; + } + data.mv_data = &txn->mt_dbs[i]; + rc = mdbx_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, F_SUBDATA); + if (unlikely(rc != MDB_SUCCESS)) 
+ goto fail; + } + } + } + + rc = mdbx_freelist_save(txn); + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + + mdbx_midl_free(env->me_pghead); + env->me_pghead = NULL; + mdbx_midl_shrink(&txn->mt_free_pgs); + + if (mdbx_audit_enabled()) + mdbx_audit(txn); + + rc = mdbx_page_flush(txn, 0); + if (likely(rc == MDB_SUCCESS)) { + MDB_meta meta; + + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; +#if MDBX_MODE_ENABLED + meta.mm_canary = txn->mt_canary; +#endif + + rc = mdbx_env_sync0(env, env->me_flags | txn->mt_flags, &meta); + } + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + end_mode = MDB_END_COMMITTED | MDB_END_UPDATE; + +done: + return mdbx_txn_end(txn, end_mode); + +fail: + mdbx_txn_abort(txn); + return rc; +} + +/** Read the environment parameters of a DB environment before + * mapping it into memory. + * @param[in] env the environment handle + * @param[out] meta address of where to store the meta information + * @return 0 on success, non-zero on failure. + */ +static int __cold mdbx_env_read_header(MDB_env *env, MDB_meta *meta) { + MDB_metabuf pbuf; + MDB_page *p; + MDB_meta *m; + int i, rc, off; + enum { Size = sizeof(pbuf) }; + + /* We don't know the page size yet, so use a minimum value. + * Read both meta pages so we can use the latest one. + */ + + meta->mm_datasync_sign = MDB_DATASIGN_WEAK; + meta->mm_txnid = 0; + for (i = off = 0; i < NUM_METAS; i++, off += meta->mm_psize) { + rc = pread(env->me_fd, &pbuf, Size, off); + if (rc != Size) { + if (rc == 0 && off == 0) + return ENOENT; + rc = rc < 0 ? 
(int)errno : MDB_INVALID; + mdbx_debug("read: %s", mdbx_strerror(rc)); + return rc; + } + + p = (MDB_page *)&pbuf; + + if (!F_ISSET(p->mp_flags, P_META)) { + mdbx_debug("page %zu not a meta page", p->mp_pgno); + return MDB_INVALID; + } + + m = PAGEDATA(p); + if (m->mm_magic != MDB_MAGIC) { + mdbx_debug("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_DATA_VERSION) { + mdbx_debug("database is version %u, expected version %u", m->mm_version, + MDB_DATA_VERSION); + return MDB_VERSION_MISMATCH; + } + + if (m->mm_datasync_sign > MDB_DATASIGN_WEAK && + m->mm_datasync_sign != mdbx_meta_sign(m)) + continue; + + if (mdbx_meta_lt(meta, m)) + *meta = *m; + } + + if (meta->mm_datasync_sign == MDB_DATASIGN_WEAK) + /* LY: Both meta-pages are weak. */ + return MDB_CORRUPTED; + + return MDB_SUCCESS; +} + +/** Fill in most of the zeroed #MDB_meta for an empty database environment */ +static void __cold mdbx_env_init_meta0(MDB_env *env, MDB_meta *meta) { + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = env->me_psize; + meta->mm_last_pg = NUM_METAS - 1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; + meta->mm_datasync_sign = mdbx_meta_sign(meta); +} + +/** Write the environment parameters of a freshly created DB environment. + * @param[in] env the environment handle + * @param[in] meta the #MDB_meta to write + * @return 0 on success, non-zero on failure. 
+ */ +static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { + MDB_page *p, *q; + int rc; + unsigned psize; + int len; + + mdbx_debug("writing new meta page"); + + psize = env->me_psize; + + p = calloc(NUM_METAS, psize); + if (!p) + return ENOMEM; + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)PAGEDATA(p) = *meta; + + q = (MDB_page *)((char *)p + psize); + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)PAGEDATA(q) = *meta; + + do + len = pwrite(env->me_fd, p, psize * NUM_METAS, 0); + while (len == -1 && errno == EINTR); + + if (len < 0) + rc = errno; + else if ((unsigned)len == psize * NUM_METAS) + rc = MDB_SUCCESS; + else + rc = ENOSPC; + free(p); + return rc; +} + +static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { + int rc; + MDB_meta *head = mdbx_meta_head_w(env); + size_t prev_mapsize = head->mm_mapsize; + size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + + mdbx_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); + mdbx_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); + mdbx_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 || + env->me_mapsize != prev_mapsize); + + pending->mm_mapsize = env->me_mapsize; + mdbx_assert(env, pending->mm_mapsize >= used_size); + if (unlikely(pending->mm_mapsize != prev_mapsize)) { + if (pending->mm_mapsize < prev_mapsize) { + /* LY: currently this can't happen, but force full-sync. */ + flags &= MDB_WRITEMAP; + } else { + /* Persist any increases of mapsize config */ + } + } + + if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) + flags &= MDB_WRITEMAP; + + /* LY: step#1 - sync previously written/updated data-pages */ + if (env->me_sync_pending && (flags & MDB_NOSYNC) == 0) { + if (env->me_flags & MDB_WRITEMAP) { + int mode = (flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; + if (unlikely(msync(env->me_map, used_size, mode))) { + rc = errno; + /* LY: msync() should never return EINTR */ + goto fail; + } + if ((flags & MDB_MAPASYNC) == 0) + env->me_sync_pending = 0; + } else { + int (*flush)(int fd) = fdatasync; + if (unlikely(prev_mapsize != pending->mm_mapsize)) { + /* LY: It is no reason to use fdatasync() here, even in case + * no such bug in a kernel. Because "no-bug" mean that a kernel + * internally do nearly the same, e.g. fdatasync() == fsync() + * when no-kernel-bug and file size was changed. + * + * So, this code is always safe and without appreciable + * performance degradation. + * + * For more info about of a corresponding fdatasync() bug + * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ + flush = fsync; + } + while (unlikely(flush(env->me_fd) < 0)) { + rc = errno; + if (rc != EINTR) + goto fail; + } + env->me_sync_pending = 0; + } + } + + /* LY: step#2 - update meta-page. */ + if (env->me_sync_pending == 0) { + pending->mm_datasync_sign = mdbx_meta_sign(pending); + } else { + pending->mm_datasync_sign = + (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC + ? MDB_DATASIGN_NONE + : MDB_DATASIGN_WEAK; + } + + volatile MDB_meta *target = + (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) + ? head + : mdbx_env_meta_flipflop(env, head); + off_t offset = (char *)target - env->me_map; + + MDB_meta *stay = mdbx_env_meta_flipflop(env, (MDB_meta *)target); + mdbx_debug( + "writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, " + "txn_id %zu, %s", + offset >= env->me_psize, target == head ? "head" : "tail", + target->mm_txnid, + META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" + : "Legacy", + stay == head ? "head" : "tail", stay->mm_txnid, + META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy", + pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid, + META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? 
"Steady" + : "Legacy"); + + if (env->me_flags & MDB_WRITEMAP) { +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + /* LY: 'invalidate' the meta, + * but mdbx_meta_head_r() will be confused/retired in collision case. */ + target->mm_datasync_sign = MDB_DATASIGN_WEAK; + target->mm_txnid = 0; + /* LY: update info */ + target->mm_mapsize = pending->mm_mapsize; + target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; + target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; + target->mm_last_pg = pending->mm_last_pg; +#if MDBX_MODE_ENABLED + target->mm_canary = pending->mm_canary; +#endif + /* LY: 'commit' the meta */ + target->mm_txnid = pending->mm_txnid; + target->mm_datasync_sign = pending->mm_datasync_sign; + } else { + pending->mm_magic = MDB_MAGIC; + pending->mm_version = MDB_DATA_VERSION; + pending->mm_address = head->mm_address; + retry: + rc = pwrite(env->me_fd, pending, sizeof(MDB_meta), offset); + if (unlikely(rc != sizeof(MDB_meta))) { + rc = (rc < 0) ? errno : EIO; + if (rc == EINTR) + goto retry; + + undo: + mdbx_debug("write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Write some old data back, to prevent it from being used. */ + if (pwrite(env->me_fd, (void *)target, sizeof(MDB_meta), offset) == + sizeof(MDB_meta)) { + /* LY: take a chance, if write succeeds at a magic ;) */ + goto retry; + } + goto fail; + } + mdbx_invalidate_cache(env->me_map + offset, sizeof(MDB_meta)); +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + } + + /* Memory ordering issues are irrelevant; since the entire writer + * is wrapped by wmutex, all of these changes will become visible + * after the wmutex is unlocked. Since the DB is multi-version, + * readers will get consistent data regardless of how fresh or + * how stale their view of these values is. 
+ */ + env->me_txns->mti_txnid = pending->mm_txnid; +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + + /* LY: step#3 - sync meta-pages. */ + if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { + if (env->me_flags & MDB_WRITEMAP) { + char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); + int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + if (unlikely(msync(ptr, env->me_os_psize, mode) < 0)) { + rc = errno; + goto fail; + } + } else { + while (unlikely(fdatasync(env->me_fd) < 0)) { + rc = errno; + if (rc != EINTR) + goto undo; + } + } + } + + /* LY: currently this can't happen, but... */ + if (unlikely(pending->mm_mapsize < prev_mapsize)) { + mdbx_assert(env, pending->mm_mapsize == env->me_mapsize); + if (unlikely(mremap(env->me_map, prev_mapsize, pending->mm_mapsize, + MREMAP_FIXED, pending->mm_address) == MAP_FAILED)) { + rc = errno; + goto fail; + } + if (unlikely(ftruncate(env->me_fd, pending->mm_mapsize) < 0)) { + rc = errno; + goto fail; + } + } + + return MDB_SUCCESS; + +fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; +} + +int __cold mdbx_env_create(MDB_env **env) { + MDB_env *e; + + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; + + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = CORE_DBS; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VALGRIND_CREATE_MEMPOOL(e, 0, 0); + e->me_signature = MDBX_ME_SIGNATURE; + *env = e; + return MDB_SUCCESS; +} + +static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { + unsigned flags = env->me_flags; + + int prot = PROT_READ; + if (flags & MDB_WRITEMAP) { + prot |= PROT_WRITE; + if (ftruncate(env->me_fd, env->me_mapsize) < 0) + return errno; + } + + env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return errno; + } + + /* Can happen because the address 
argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + if (addr && env->me_map != addr) { + errno = 0; /* LY: clean errno as a hit for this case */ + return EBUSY; /* TODO: Make a new MDB_* error code? */ + } + + if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) + return errno; + +#ifdef MADV_NOHUGEPAGE + (void)madvise(env->me_map, env->me_mapsize, MADV_NOHUGEPAGE); +#endif + +#ifdef MADV_DONTDUMP + if (!(flags & MDBX_PAGEPERTURB)) { + (void)madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); + } +#endif + +#ifdef MADV_REMOVE + if (flags & MDB_WRITEMAP) { + (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, + MADV_REMOVE); + } +#endif + + /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ + if (madvise(env->me_map, env->me_mapsize, + (flags & MDB_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) + return errno; + + /* Lock meta pages to avoid unexpected write, + * before the data pages would be synchronized. */ + if ((flags & MDB_WRITEMAP) && mlock(env->me_map, env->me_psize * 2)) + return errno; + +#ifdef USE_VALGRIND + env->me_valgrind_handle = + VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "lmdb"); +#endif + + return MDB_SUCCESS; +} + +int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(size < env->me_psize * 8)) + return EINVAL; + + /* If env is already open, caller is responsible for making + * sure there are no active txns. 
+ */ + if (env->me_map) { + int rc; + MDB_meta *meta; + void *old; + if (env->me_txn) + return EINVAL; + meta = mdbx_meta_head_w(env); + if (!size) + size = meta->mm_mapsize; + /* Silently round up to minimum if the size is too small */ + const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; + if (size < usedsize) + size = usedsize; + munmap(env->me_map, env->me_mapsize); +#ifdef USE_VALGRIND + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = -1; +#endif + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdbx_env_map(env, old, usedsize); + if (rc) + return rc; + } + env->me_mapsize = size; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; +} + +int __cold mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(env->me_map)) + return EINVAL; + + env->me_maxdbs = dbs + CORE_DBS; + return MDB_SUCCESS; +} + +int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { + if (unlikely(!env || readers < 1)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(env->me_map)) + return EINVAL; + + env->me_maxreaders = readers; + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { + if (!env || !readers) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + *readers = env->me_maxreaders; + return MDB_SUCCESS; +} + +static int __cold mdbx_fsize(HANDLE fd, size_t *size) { + struct stat st; + + if (fstat(fd, &st)) + return errno; + + *size = st.st_size; + return MDB_SUCCESS; +} + +/** Further setup required for opening an LMDB environment + */ +static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { + unsigned flags = env->me_flags; + int i, 
newenv = 0, rc; + + if ((i = mdbx_env_read_header(env, meta)) != 0) { + if (i != ENOENT) + return i; + mdbx_debug("new mdbenv"); + newenv = 1; + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; + memset(meta, 0, sizeof(*meta)); + mdbx_env_init_meta0(env, meta); + meta->mm_mapsize = DEFAULT_MAPSIZE; + } else { + env->me_psize = meta->mm_psize; + } + + /* Was a mapsize configured? */ + if (!env->me_mapsize) { + env->me_mapsize = meta->mm_mapsize; + } + { + /* Make sure mapsize >= committed data size. Even when using + * mm_mapsize, which could be broken in old files (ITS#7789). + */ + size_t minsize = (meta->mm_last_pg + 1) * meta->mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; + } + meta->mm_mapsize = env->me_mapsize; + + if (newenv && !(flags & MDB_FIXEDMAP)) { + /* mdbx_env_map() may grow the datafile. Write the metapages + * first, so the file will be valid if initialization fails. + * Except with FIXEDMAP, since we do not yet know mm_address. + * We could fill in mm_address later, but then a different + * program might end up doing that - one with a memory layout + * and map address which does not suit the main program. + */ + rc = mdbx_env_init_meta(env, meta); + if (rc) + return rc; + newenv = 0; + } + + const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; + rc = mdbx_env_map(env, (flags & MDB_FIXEDMAP) ? 
meta->mm_address : NULL, + usedsize); + if (rc) + return rc; + + if (newenv) { + if (flags & MDB_FIXEDMAP) + meta->mm_address = env->me_map; + i = mdbx_env_init_meta(env, meta); + if (i != MDB_SUCCESS) { + return i; + } + } + + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = + (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - sizeof(indx_t); + env->me_maxkey_limit = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); + env->me_maxpg = env->me_mapsize / env->me_psize; + + if (MDB_MAXKEYSIZE > env->me_maxkey_limit) + return MDB_BAD_VALSIZE; + + return MDB_SUCCESS; +} + +/****************************************************************************/ + +#ifndef MDBX_USE_THREAD_ATEXIT +#if __GLIBC_PREREQ(2, 18) +#define MDBX_USE_THREAD_ATEXIT 1 +#else +#define MDBX_USE_THREAD_ATEXIT 0 +#endif +#endif + +static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; +static MDBX_rthc *mdbx_rthc_list; +static pthread_key_t mdbx_pthread_crutch_key; + +static __inline void mdbx_rthc_lock(void) { + mdbx_ensure(NULL, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); +} + +static __inline void mdbx_rthc_unlock(void) { + mdbx_ensure(NULL, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); +} + +/** Release a reader thread's slot in the reader lock table. + * This function is called automatically when a thread exits. + * @param[in] ptr This points to the MDB_rthc of a slot in the reader lock + *table. + */ +static __cold void mdbx_rthc_dtor(void) { + /* LY: Основная задача этого деструктора была и есть в освобождении + * слота таблицы читателей при завершении треда, но тут есть пара + * не очевидных сложностей: + * - Таблица читателей располагается в разделяемой памяти, поэтому + * во избежание segfault деструктор не должен что-либо делать после + * или одновременно с mdbx_env_close(). + * - Действительно, mdbx_env_close() вызовет pthread_key_delete() и + * после этого glibc не будет вызывать деструктор. 
+ * - ОДНАКО, это никак не решает проблему гонок между mdbx_env_close() + * и завершающимися тредами. Грубо говоря, при старте mdbx_env_close() + * деструктор уже может выполняться в некоторых тредах, и завершиться + * эти выполнения могут во время или после окончания mdbx_env_close(). + * - БОЛЕЕ ТОГО, схожая проблема возникает при выгрузке dso/dll, + * так как в текущей glibc (2.24) подсистема ld.so ничего не знает о + * TSD-деструкторах и поэтому может выгрузить lib.so до того как + * отработали все деструкторы. + * - Исходное проявление проблемы было зафиксировано + * в https://github.com/ReOpen/ReOpenLDAP/issues/48 + * + * Предыдущее решение посредством выделяемого динамически MDB_rthc + * было не удачным, так как порождало либо утечку памяти, + * либо вероятностное обращение к уже освобожденной памяти + * из этого деструктора. + * + * Текущее решение достаточно "развесисто", но решает все описанные выше + * проблемы без пенальти по производительности. + */ + + mdbx_rthc_lock(); + + pid_t pid = getpid(); + pthread_t thread = pthread_self(); + for (MDBX_rthc **ref = &mdbx_rthc_list; *ref;) { + MDBX_rthc *rthc = *ref; + if (rthc->rc_thread == thread) { + if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { + rthc->rc_reader->mr_pid = 0; + mdbx_coherent_barrier(); + } + *ref = rthc->rc_next; + free(rthc); + } else { + ref = &(*ref)->rc_next; + } + } + + mdbx_rthc_unlock(); +} + +#if MDBX_USE_THREAD_ATEXIT + +extern void *__dso_handle __attribute__((__weak__)); +extern int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, + void *dso_symbol); + +static __cold void mdbx_rthc__thread_atexit(void *ptr) { + mdbx_ensure(NULL, ptr == pthread_getspecific(mdbx_pthread_crutch_key)); + mdbx_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, NULL) == 0); + mdbx_rthc_dtor(); +} + +static __attribute__((constructor)) __cold void mdbx_pthread_crutch_ctor(void) { + mdbx_ensure(NULL, pthread_key_create(&mdbx_pthread_crutch_key, NULL) == 0); +} + +#else /* 
MDBX_USE_THREAD_ATEXIT */ + +static __cold void mdbx_rthc__thread_key_dtor(void *ptr) { + (void)ptr; + if (mdbx_pthread_crutch_key != (pthread_key_t)-1) + mdbx_rthc_dtor(); +} + +static __attribute__((constructor)) __cold void mdbx_pthread_crutch_ctor(void) { + mdbx_ensure(NULL, pthread_key_create(&mdbx_pthread_crutch_key, + mdbx_rthc__thread_key_dtor) == 0); +} + +static __attribute__((destructor)) __cold void mdbx_pthread_crutch_dtor(void) { + pthread_key_delete(mdbx_pthread_crutch_key); + mdbx_pthread_crutch_key = -1; + + /* LY: Из-за race condition в pthread_key_delete() + * деструкторы уже могли начать выполняться. + * Уступая квант времени сразу после удаления ключа + * мы даем им шанс завершиться. */ + pthread_yield(); + + mdbx_rthc_lock(); + pid_t pid = getpid(); + while (mdbx_rthc_list != NULL) { + MDBX_rthc *rthc = mdbx_rthc_list; + mdbx_rthc_list = mdbx_rthc_list->rc_next; + if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { + rthc->rc_reader->mr_pid = 0; + mdbx_coherent_barrier(); + } + free(rthc); + + /* LY: Каждый неудаленный элемент списка - это один + * не отработавший деструктор и потенциальный + * шанс получить segfault после выгрузки lib.so + * Поэтому на каждой итерации уступаем квант времени, + * в надежде что деструкторы успеют отработать. */ + mdbx_rthc_unlock(); + pthread_yield(); + mdbx_rthc_lock(); + } + mdbx_rthc_unlock(); + pthread_yield(); +} +#endif /* MDBX_USE_THREAD_ATEXIT */ + +static __cold MDBX_rthc *mdbx_rthc_add(pthread_key_t key) { + MDBX_rthc *rthc = malloc(sizeof(MDBX_rthc)); + if (unlikely(rthc == NULL)) + goto bailout; + + rthc->rc_next = NULL; + rthc->rc_reader = NULL; + rthc->rc_thread = pthread_self(); + if (unlikely(pthread_setspecific(key, rthc) != 0)) + goto bailout_free; + + mdbx_rthc_lock(); + if (pthread_getspecific(mdbx_pthread_crutch_key) == NULL) { +#if MDBX_USE_THREAD_ATEXIT + void *dso_anchor = + (&__dso_handle && __dso_handle) ? 
__dso_handle : (void *)mdbx_version; + if (unlikely(__cxa_thread_atexit_impl(mdbx_rthc__thread_atexit, rthc, + dso_anchor) != 0)) { + mdbx_rthc_unlock(); + goto bailout_free; + } +#endif /* MDBX_USE_THREAD_ATEXIT */ + mdbx_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, rthc) == 0); + } + rthc->rc_next = mdbx_rthc_list; + mdbx_rthc_list = rthc; + mdbx_rthc_unlock(); + return rthc; + +bailout_free: + free(rthc); +bailout: + return NULL; +} + +static __inline MDBX_rthc *mdbx_rthc_get(pthread_key_t key) { + MDBX_rthc *rthc = pthread_getspecific(key); + if (likely(rthc != NULL)) + return rthc; + return mdbx_rthc_add(key); +} + +static __cold void mdbx_rthc_cleanup(MDB_env *env) { + mdbx_rthc_lock(); + + MDB_reader *begin = env->me_txns->mti_readers; + MDB_reader *end = begin + env->me_close_readers; + for (MDBX_rthc **ref = &mdbx_rthc_list; *ref;) { + MDBX_rthc *rthc = *ref; + if (rthc->rc_reader >= begin && rthc->rc_reader < end) { + if (rthc->rc_reader->mr_pid == env->me_pid) { + rthc->rc_reader->mr_pid = 0; + mdbx_coherent_barrier(); + } + *ref = rthc->rc_next; + free(rthc); + } else { + ref = &(*ref)->rc_next; + } + } + + mdbx_rthc_unlock(); +} + +/****************************************************************************/ + +/** Downgrade the exclusive lock on the region back to shared */ +static __cold int mdbx_env_share_locks(MDB_env *env, int *excl) { + struct flock lock_info; + int rc = 0; + + /* The shared lock replaces the existing lock */ + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_RDLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = errno) == EINTR) + ; + *excl = rc ? -1 : 0; /* error may mean we lost the lock */ + + return rc; +} + +/** Try to get exclusive lock, otherwise shared. + * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
+ */ +static int __cold mdbx_env_excl_lock(MDB_env *env, int *excl) { + int rc = 0; + struct flock lock_info; + + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = errno) == EINTR) + ; + if (!rc) { + *excl = 1; + } else { + lock_info.l_type = F_RDLCK; + while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && + (rc = errno) == EINTR) + ; + if (rc == 0) + *excl = 0; + } + return rc; +} + +#ifdef MDB_USE_HASH +/* + * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code + * + * @(#) $Revision: 5.1 $ + * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ + * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html + * + *** + * + * Please do not copyright this code. This code is in the public domain. + * + * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO + * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + * + * By: + * chongo /\oo/\ + * http://www.isthe.com/chongo/ + * + * Share and Enjoy! :-) + */ + +typedef unsigned long long mdbx_hash_t; +#define MDB_HASH_INIT ((mdbx_hash_t)0xcbf29ce484222325ULL) + +/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * @param[in] val value to hash + * @param[in] hval initial value for hash + * @return 64 bit hash + * + * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the + * hval arg on the first call. 
+ */ +static mdbx_hash_t mdbx_hash_val(MDB_val *val, mdbx_hash_t hval) { + unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ + unsigned char *end = s + val->mv_size; + /* + * FNV-1a hash each octet of the string + */ + while (s < end) { + /* xor the bottom with the current octet */ + hval ^= (mdbx_hash_t)*s++; + + /* multiply by the 64 bit FNV magic prime mod 2^64 */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + + (hval << 8) + (hval << 40); + } + /* return our new hash value */ + return hval; +} + +/** Hash the string and output the encoded hash. + * This uses modified RFC1924 Ascii85 encoding to accommodate systems with + * very short name limits. We don't care about the encoding being reversible, + * we just want to preserve as many bits of the input as possible in a + * small printable string. + * @param[in] str string to hash + * @param[out] encbuf an array of 11 chars to hold the hash + */ +static const char mdbx_a85[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij" + "klmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; + +static void __cold mdbx_pack85(unsigned long l, char *out) { + int i; + + for (i = 0; i < 5; i++) { + *out++ = mdbx_a85[l % 85]; + l /= 85; + } +} + +static void __cold mdbx_hash_enc(MDB_val *val, char *encbuf) { + mdbx_hash_t h = mdbx_hash_val(val, MDB_HASH_INIT); + + mdbx_pack85(h, encbuf); + mdbx_pack85(h >> 32, encbuf + 5); + encbuf[10] = '\0'; +} +#endif + +/** Open and/or initialize the lock region for the environment. + * @param[in] env The LMDB environment. + * @param[in] lpath The pathname of the file used for the lock region. + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive + * @return 0 on success, non-zero on failure. 
+ */ +static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, + int *excl) { + int fdflags; + int rc; + off_t size, rsize; + void *m; + + env->me_lfd = open(lpath, O_RDWR | O_CREAT | O_CLOEXEC, mode); + if (env->me_lfd == INVALID_HANDLE_VALUE) { + rc = errno; + if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { + return MDB_SUCCESS; + } + return rc; + } + + /* Lose record locks when exec*() */ + if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_lfd, F_SETFD, fdflags); + + if (!(env->me_flags & MDB_NOTLS)) { + rc = pthread_key_create(&env->me_txkey, NULL); + if (rc) + return rc; + env->me_flags |= MDB_ENV_TXKEY; + } + + /* Try to get exclusive lock. If we succeed, then + * nobody is using the lock region and we should initialize it. + */ + if ((rc = mdbx_env_excl_lock(env, excl))) + return rc; + + size = lseek(env->me_lfd, 0, SEEK_END); + if (size == -1) + return errno; + rsize = (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); + if (size < rsize && *excl > 0) { + if (ftruncate(env->me_lfd, rsize) != 0) + return errno; + } else { + rsize = size; + size = rsize - sizeof(MDB_txninfo); + env->me_maxreaders = size / sizeof(MDB_reader) + 1; + } + + m = mmap(NULL, rsize, PROT_READ | PROT_WRITE, MAP_SHARED, env->me_lfd, 0); + if (m == MAP_FAILED) + return errno; + env->me_txns = m; + +#ifdef MADV_NOHUGEPAGE + (void)madvise(env->me_txns, rsize, MADV_NOHUGEPAGE); +#endif + +#ifdef MADV_DODUMP + (void)madvise(env->me_txns, rsize, MADV_DODUMP); +#endif + + if (madvise(env->me_txns, rsize, MADV_DONTFORK) < 0) + return errno; + + if (madvise(env->me_txns, rsize, MADV_WILLNEED) < 0) + return errno; + + if (madvise(env->me_txns, rsize, MADV_RANDOM) < 0) + return errno; + + if (*excl > 0) { + /* Solaris needs this before initing a robust mutex. Otherwise + * it may skip the init and return EBUSY "seems someone already + * inited" or EINVAL "it was inited differently".
+ */ + memset(&env->me_txns->mti_rmutex, 0, sizeof(env->me_txns->mti_rmutex)); + memset(&env->me_txns->mti_wmutex, 0, sizeof(env->me_txns->mti_wmutex)); + + pthread_mutexattr_t mattr; + rc = pthread_mutexattr_init(&mattr); + if (rc) + return rc; + + rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); + +#if MDB_USE_ROBUST + if (!rc) + rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); +#endif /* MDB_USE_ROBUST */ + if (!rc) + rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr); + if (!rc) + rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); + + pthread_mutexattr_destroy(&mattr); + if (rc) + return rc; + + env->me_txns->mti_magic = MDB_MAGIC; + env->me_txns->mti_format = MDB_LOCK_FORMAT; + env->me_txns->mti_txnid = ~0L; + env->me_txns->mti_numreaders = 0; + } else { + if (env->me_txns->mti_magic != MDB_MAGIC) { + mdbx_debug("lock region has invalid magic"); + return MDB_INVALID; + } + if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { + mdbx_debug("lock region has format+version 0x%x, expected 0x%x", + env->me_txns->mti_format, MDB_LOCK_FORMAT); + return MDB_VERSION_MISMATCH; + } + } + + return MDB_SUCCESS; +} + +/** The name of the lock file in the DB environment */ +#define LOCKNAME "/lock.mdb" +/** The name of the data file in the DB environment */ +#define DATANAME "/data.mdb" +/** The suffix of the lock file when no subdir is used */ +#define LOCKSUFF "-lock" +/** Only a subset of the @ref mdbx_env flags can be changed + * at runtime. Changing other flags requires closing the + * environment and re-opening it with the new flags. 
+ */ +#define CHANGEABLE \ + (MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC | MDB_NOMEMINIT | \ + MDBX_COALESCE | MDBX_PAGEPERTURB) +#define CHANGELESS \ + (MDB_FIXEDMAP | MDB_NOSUBDIR | MDB_RDONLY | MDB_WRITEMAP | MDB_NOTLS | \ + MDB_NORDAHEAD | MDBX_LIFORECLAIM) + +#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE | CHANGELESS) +#error "Persistent DB flags & env flags overlap, but both go in mm_flags" +#endif + +int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, + mode_t mode, int *exclusive) { + int oflags, rc, len, excl = -1; + char *lpath, *dpath; + + if (unlikely(!env || !path)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (env->me_fd != INVALID_HANDLE_VALUE || + (flags & ~(CHANGEABLE | CHANGELESS))) + return EINVAL; + + len = strlen(path); + if (flags & MDB_NOSUBDIR) { + rc = len + sizeof(LOCKSUFF) + len + 1; + } else { + rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); + } + lpath = malloc(rc); + if (!lpath) + return ENOMEM; + if (flags & MDB_NOSUBDIR) { + dpath = lpath + len + sizeof(LOCKSUFF); + sprintf(lpath, "%s" LOCKSUFF, path); + strcpy(dpath, path); + } else { + dpath = lpath + len + sizeof(LOCKNAME); + sprintf(lpath, "%s" LOCKNAME, path); + sprintf(dpath, "%s" DATANAME, path); + } + + rc = MDB_SUCCESS; + flags |= env->me_flags; + if (flags & MDB_RDONLY) { + /* LY: silently ignore irrelevant flags when we're only getting read + * access */ + flags &= ~(MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOSYNC | MDB_NOMETASYNC | + MDBX_COALESCE | MDBX_LIFORECLAIM | MDB_NOMEMINIT); + } else { + if (!((env->me_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + env->me_dbiseqs = 
calloc(env->me_maxdbs, sizeof(unsigned)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + rc = ENOMEM; + goto leave; + } + env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDB_INTEGERKEY */ + + /* For RDONLY, get lockfile after we know datafile exists */ + if (!(flags & MDB_RDONLY)) { + rc = mdbx_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + + if (F_ISSET(flags, MDB_RDONLY)) + oflags = O_RDONLY; + else + oflags = O_RDWR | O_CREAT; + + env->me_fd = open(dpath, oflags | O_CLOEXEC, mode); + if (env->me_fd == INVALID_HANDLE_VALUE) { + rc = errno; + goto leave; + } + + int fdflags; + if ((fdflags = fcntl(env->me_fd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_fd, F_SETFD, fdflags); + + if (flags & MDB_RDONLY) { + rc = mdbx_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + + MDB_meta meta; + if ((rc = mdbx_env_open2(env, &meta)) == MDB_SUCCESS) { + mdbx_debug("opened dbenv %p", (void *)env); + if (excl > 0) { + env->me_txns->mti_txnid = meta.mm_txnid; + if (exclusive == NULL || *exclusive < 2) { + /* LY: downgrade lock only if exclusive access not requested. + * in case exclusive==1, just leave value as is. */ + rc = mdbx_env_share_locks(env, &excl); + if (rc) + goto leave; + } + } else if (exclusive) { + /* LY: just indicate that is not an exclusive access. 
*/ + *exclusive = 0; + } + if (!(flags & MDB_RDONLY)) { + MDB_txn *txn; + int tsize = sizeof(MDB_txn), + size = tsize + + env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + + sizeof(unsigned) + 1); + if ((env->me_pbuf = calloc(1, env->me_psize)) && + (txn = calloc(1, size))) { + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDB_TXN_FINISHED; + env->me_txn0 = txn; + } else { + rc = ENOMEM; + } + } + } + +#if MDB_DEBUG + if (rc == MDB_SUCCESS) { + MDB_meta *meta = mdbx_meta_head_r(env); + MDB_db *db = &meta->mm_dbs[MAIN_DBI]; + int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 0 : 1; + + mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, + env->me_psize); + mdbx_debug("using meta page %d, txn %zu", toggle, meta->mm_txnid); + mdbx_debug("depth: %u", db->md_depth); + mdbx_debug("entries: %zu", db->md_entries); + mdbx_debug("branch pages: %zu", db->md_branch_pages); + mdbx_debug("leaf pages: %zu", db->md_leaf_pages); + mdbx_debug("overflow pages: %zu", db->md_overflow_pages); + mdbx_debug("root: %zu", db->md_root); + } +#endif + +leave: + if (rc) + mdbx_env_close0(env); + free(lpath); + return rc; +} + +int __cold mdbx_env_open(MDB_env *env, const char *path, unsigned flags, + mode_t mode) { + return mdbx_env_open_ex(env, path, flags, mode, NULL); +} + +/** Destroy resources from mdbx_env_open(), clear our readers & DBIs */ +static void __cold mdbx_env_close0(MDB_env *env) { + int i; + + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; + env->me_flags &= ~MDB_ENV_ACTIVE; + + /* Doing this here since me_dbxs may not exist during mdbx_env_close */ + if (env->me_dbxs) { + for (i = env->me_maxdbs; --i >= CORE_DBS;) + free(env->me_dbxs[i].md_name.mv_data); + 
free(env->me_dbxs); + } + + free(env->me_pbuf); + free(env->me_dbiseqs); + free(env->me_dbflags); + free(env->me_path); + free(env->me_dirty_list); + if (env->me_txn0) + mdbx_midl_free(env->me_txn0->mt_lifo_reclaimed); + free(env->me_txn0); + mdbx_midl_free(env->me_free_pgs); + + if (env->me_flags & MDB_ENV_TXKEY) { + mdbx_ensure(env, pthread_key_delete(env->me_txkey) == 0); + env->me_flags &= ~MDB_ENV_TXKEY; + } + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); +#ifdef USE_VALGRIND + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = -1; +#endif + } + if (env->me_fd != INVALID_HANDLE_VALUE) + (void)close(env->me_fd); + + /* Clearing readers is done in this function because + * me_txkey with its destructor must be disabled first. + * + * We skip the the reader mutex, so we touch only + * data owned by this process (me_close_readers and + * our readers), and clear each reader atomically. + */ + if (env->me_pid == getpid()) + mdbx_rthc_cleanup(env); + + munmap((void *)env->me_txns, + (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDB_txninfo)); + env->me_txns = NULL; + env->me_pid = 0; + + if (env->me_lfd != INVALID_HANDLE_VALUE) { + (void)close(env->me_lfd); + } +} + +int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { + MDB_page *dp; + int rc = MDB_SUCCESS; + + if (unlikely(!env)) + return EINVAL; + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (!dont_sync && env->me_txns) + rc = mdbx_env_sync(env, 1); + + VALGRIND_DESTROY_MEMPOOL(env); + while ((dp = env->me_dpages) != NULL) { + ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next)); + VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + } + + mdbx_env_close0(env); + env->me_signature = 0; + free(env); + + return rc; +} + +void __cold mdbx_env_close(MDB_env *env) { mdbx_env_close_ex(env, 0); } + +/* LY: fast enough on most arches + * + * / + * | -1, a < b + * 
cmp2int(a,b) = < 0, a == b + * | 1, a > b + * \ + */ +#if 1 +#define mdbx_cmp2int(a, b) (((b) > (a)) ? -1 : (a) > (b)) +#else +#define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) +#endif + +/** Compare two items pointing at aligned unsigned int's. */ +static int __hot mdbx_cmp_int_ai(const MDB_val *a, const MDB_val *b) { + mdbx_assert(NULL, a->mv_size == b->mv_size); + mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(int) && + 0 == (uintptr_t)b->mv_data % sizeof(int)); + + if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) + return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); + + mdbx_assert(NULL, a->mv_size == sizeof(int)); + return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); +} + +/** Compare two items pointing at 2-byte aligned unsigned int's. */ +static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { + mdbx_assert(NULL, a->mv_size == b->mv_size); + mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(uint16_t) && + 0 == (uintptr_t)b->mv_data % sizeof(uint16_t)); +#ifdef MISALIGNED_OK + if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) + return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); + + mdbx_assert(NULL, a->mv_size == sizeof(int)); + return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); +#else + mdbx_assert(NULL, 0 == a->mv_size % sizeof(uint16_t)); + { + int diff; + const uint16_t *pa, *pb, *end; + +#if BYTE_ORDER == LITTLE_ENDIAN + end = (const uint16_t *)a->mv_data; + pa = (const uint16_t *)((char *)a->mv_data + a->mv_size); + pb = (const uint16_t *)((char *)b->mv_data + a->mv_size); + do { + diff = *--pa - *--pb; +#else /* BYTE_ORDER */ + end = (const uint16_t *)((char *)a->mv_data + a->mv_size); + pa = (const uint16_t *)a->mv_data; + pb = (const uint16_t *)b->mv_data; + do { + diff = *pa++ - *pb++; +#endif /* BYTE_ORDER */ + if (likely(diff != 0)) + break; + } while (pa != end); + return diff; + } +#endif /* 
MISALIGNED_OK */ +} + +/** Compare two items pointing at unsigneds of unknown alignment. + * + * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp. + */ +static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { + mdbx_assert(NULL, a->mv_size == b->mv_size); +#if MISALIGNED_OK + if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) + return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); + + mdbx_assert(NULL, a->mv_size == sizeof(int)); + return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); +#else + mdbx_assert(NULL, a->mv_size == sizeof(int) || a->mv_size == sizeof(size_t)); +#if BYTE_ORDER == LITTLE_ENDIAN + { + int diff; + const uint8_t *pa, *pb; + + pa = (const uint8_t *)a->mv_data + a->mv_size; + pb = (const uint8_t *)b->mv_data + a->mv_size; + + do { + diff = *--pa - *--pb; + if (likely(diff != 0)) + break; + } while (pa != a->mv_data); + return diff; + } +#else /* BYTE_ORDER */ + return memcmp(a->mv_data, b->mv_data, a->mv_size); +#endif /* BYTE_ORDER */ +#endif /* MISALIGNED_OK */ +} + +/** Compare two items lexically */ +static int __hot mdbx_cmp_memn(const MDB_val *a, const MDB_val *b) { +/* LY: assumes that length of keys are NOT equal for most cases, + * if no then branch-prediction should mitigate the problem */ +#if 0 + /* LY: without branch instructions on x86, + * but isn't best for equal length of keys */ + int diff_len = mdbx_cmp2int(a->mv_size, b->mv_size); +#else + /* LY: best when length of keys are equal, + * but got a branch-penalty otherwise */ + if (unlikely(a->mv_size == b->mv_size)) + return memcmp(a->mv_data, b->mv_data, a->mv_size); + int diff_len = (a->mv_size < b->mv_size) ? -1 : 1; +#endif + size_t shortest = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; + int diff_data = memcmp(a->mv_data, b->mv_data, shortest); + return likely(diff_data) ? 
diff_data : diff_len; +} + +/** Compare two items in reverse byte order */ +static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { + const uint8_t *pa, *pb, *end; + + pa = (const uint8_t *)a->mv_data + a->mv_size; + pb = (const uint8_t *)b->mv_data + b->mv_size; + size_t minlen = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; + end = pa - minlen; + + while (pa != end) { + int diff = *--pa - *--pb; + if (likely(diff)) + return diff; + } + return mdbx_cmp2int(a->mv_size, b->mv_size); +} + +/** Search for key within a page, using binary search. + * Returns the smallest entry larger or equal to the key. + * If exactp is non-null, stores whether the found entry was an exact match + * in *exactp (1 or 0). + * Updates the cursor index with the index of the found entry. + * If no entry larger or equal to the key is found, returns NULL. + */ +static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, + int *exactp) { + unsigned i = 0, nkeys; + int low, high; + int rc = 0; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NULL; + MDB_val nodekey; + MDB_cmp_func *cmp; + DKBUF; + + nkeys = NUMKEYS(mp); + + mdbx_debug("searching %u keys in %s %spage %zu", nkeys, + IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mdbx_dbg_pgno(mp)); + + low = IS_LEAF(mp) ? 0 : 1; + high = nkeys - 1; + cmp = mc->mc_dbx->md_cmp; + + /* Branch pages have no data, so if using integer keys, + * alignment is guaranteed. Use faster mdbx_cmp_int_ai. 
+ */ + if (cmp == mdbx_cmp_int_a2 && IS_BRANCH(mp)) + cmp = mdbx_cmp_int_ai; + + if (IS_LEAF2(mp)) { + nodekey.mv_size = mc->mc_db->md_xsize; + node = NODEPTR(mp, 0); /* fake */ + while (low <= high) { + i = (low + high) >> 1; + nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); + rc = cmp(key, &nodekey); + mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } else { + while (low <= high) { + i = (low + high) >> 1; + + node = NODEPTR(mp, i); + nodekey.mv_size = NODEKSZ(node); + nodekey.mv_data = NODEKEY(node); + + rc = cmp(key, &nodekey); + if (IS_LEAF(mp)) + mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); + else + mdbx_debug("found branch index %u [%s -> %zu], rc = %i", i, + DKEY(&nodekey), NODEPGNO(node), rc); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } + + if (rc > 0) { /* Found entry is less than the key. */ + i++; /* Skip to get the smallest entry larger than key. */ + if (!IS_LEAF2(mp)) + node = NODEPTR(mp, i); + } + if (exactp) + *exactp = (rc == 0 && nkeys > 0); + /* store the key index */ + mc->mc_ki[mc->mc_top] = i; + if (i >= nkeys) + /* There is no entry larger or equal to the key. */ + return NULL; + + /* nodeptr is fake for LEAF2 */ + return node; +} + +#if 0 +static void +mdbx_cursor_adjust(MDB_cursor *mc, func) { - if (unlikely(mc == NULL)) - return EINVAL; + MDB_cursor *m2; - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { + func(mc, m2); + } + } +} +#endif - if ((mc->mc_flags & C_INITIALIZED) == 0) - return MDBX_RESULT_TRUE; +/** Pop a page off the top of the cursor's stack. 
*/ +static void mdbx_cursor_pop(MDB_cursor *mc) { + if (mc->mc_snum) { + mdbx_debug("popped page %zu off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - if (mc->mc_snum == 0) - return MDBX_RESULT_TRUE; - - if ((mc->mc_flags & C_EOF) - && mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - return MDBX_RESULT_TRUE; - - return MDBX_RESULT_FALSE; + mc->mc_snum--; + if (mc->mc_snum) { + mc->mc_top--; + } else { + mc->mc_flags &= ~C_INITIALIZED; + } + } } -static int mdbx_is_samedata(const MDB_val* a, const MDB_val* b) { - return a->iov_len == b->iov_len - && memcmp(a->iov_base, b->iov_base, a->iov_len) == 0; +/** Push a page onto the top of the cursor's stack. + * Set #MDB_TXN_ERROR on failure. + */ +static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { + mdbx_debug("pushing page %zu on db %d cursor %p", mp->mp_pgno, DDBI(mc), + (void *)mc); + + if (unlikely(mc->mc_snum >= CURSOR_STACK)) { + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CURSOR_FULL; + } + + mc->mc_top = mc->mc_snum++; + mc->mc_pg[mc->mc_top] = mp; + mc->mc_ki[mc->mc_top] = 0; + + return MDB_SUCCESS; +} + +/** Find the address of the page corresponding to a given page number. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc the cursor accessing the page. + * @param[in] pgno the page number for the page to retrieve. + * @param[out] ret address of a pointer where the page's address will be + * stored. + * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, + * 0=mapped page. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, + int *lvl) { + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + MDB_page *p = NULL; + int level; + + if (!(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_WRITEMAP))) { + MDB_txn *tx2 = txn; + level = 1; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). */ + if (tx2->mt_spill_pgs) { + MDB_ID pn = pgno << 1; + x = mdbx_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) + goto mapped; + } + if (dl[0].mid) { + unsigned x = mdbx_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + level++; + } while ((tx2 = tx2->mt_parent) != NULL); + } + + if (unlikely(pgno >= txn->mt_next_pgno)) { + mdbx_debug("page %zu not found", pgno); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; + } + level = 0; + +mapped: + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + +done: + *ret = p; + if (lvl) + *lvl = level; + return MDB_SUCCESS; +} + +/** Finish #mdbx_page_search() / #mdbx_page_search_lowest(). + * The cursor is at the root page, set up the rest of it. + */ +static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int rc; + DKBUF; + + while (IS_BRANCH(mp)) { + MDB_node *node; + indx_t i; + + mdbx_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp)); + /* Don't assert on branch pages in the FreeDB. We can get here + * while in the process of rebalancing a FreeDB branch page; we must + * let that proceed. 
ITS#8336 + */ + mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); + mdbx_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0))); + + if (flags & (MDB_PS_FIRST | MDB_PS_LAST)) { + i = 0; + if (flags & MDB_PS_LAST) { + i = NUMKEYS(mp) - 1; + /* if already init'd, see if we're already in right place */ + if (mc->mc_flags & C_INITIALIZED) { + if (mc->mc_ki[mc->mc_top] == i) { + mc->mc_top = mc->mc_snum++; + mp = mc->mc_pg[mc->mc_top]; + goto ready; + } + } + } + } else { + int exact; + node = mdbx_node_search(mc, key, &exact); + if (node == NULL) + i = NUMKEYS(mp) - 1; + else { + i = mc->mc_ki[mc->mc_top]; + if (!exact) { + mdbx_cassert(mc, i > 0); + i--; + } + } + mdbx_debug("following index %u for key [%s]", i, DKEY(key)); + } + + mdbx_cassert(mc, i < NUMKEYS(mp)); + node = NODEPTR(mp, i); + + if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) + return rc; + + mc->mc_ki[mc->mc_top] = i; + if (unlikely(rc = mdbx_cursor_push(mc, mp))) + return rc; + + ready: + if (flags & MDB_PS_MODIFY) { + if (unlikely((rc = mdbx_page_touch(mc)) != 0)) + return rc; + mp = mc->mc_pg[mc->mc_top]; + } + } + + if (unlikely(!IS_LEAF(mp))) { + mdbx_debug("internal error, index points to a %02X page!?", mp->mp_flags); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + + mdbx_debug("found leaf page %zu for key [%s]", mp->mp_pgno, + key ? DKEY(key) : "null"); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + return MDB_SUCCESS; +} + +/** Search for the lowest key under the current branch page. + * This just bypasses a NUMKEYS check in the current page + * before calling mdbx_page_search_root(), because the callers + * are all in situations where the current page is known to + * be underfilled. 
+ */ +static int mdbx_page_search_lowest(MDB_cursor *mc) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; + + if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if (unlikely(rc = mdbx_cursor_push(mc, mp))) + return rc; + return mdbx_page_search_root(mc, NULL, MDB_PS_FIRST); +} + +/** Search for the page a given key should be in. + * Push it and its parent pages on the cursor stack. + * @param[in,out] mc the cursor for this operation. + * @param[in] key the key to search for, or NULL for first/last page. + * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB + * are touched (updated with new page numbers). + * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. + * This is used by #mdbx_cursor_first() and #mdbx_cursor_last(). + * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { + int rc; + pgno_t root; + + /* Make sure the txn is still viable, then find the root from + * the txn's db table and set it as the root of the cursor's stack. 
+ */ + if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) { + mdbx_debug("transaction has failed, must abort"); + return MDB_BAD_TXN; + } else { + /* Make sure we're using an up-to-date root */ + if (unlikely(*mc->mc_dbflag & DB_STALE)) { + MDB_cursor mc2; + if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + return MDB_BAD_DBI; + mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, 0); + if (rc) + return rc; + { + MDB_val data; + int exact = 0; + uint16_t flags; + MDB_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); + if (!exact) + return MDB_NOTFOUND; + if (unlikely((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDB_INCOMPATIBLE; /* not a named DB */ + rc = mdbx_node_read(&mc2, leaf, &data); + if (rc) + return rc; + memcpy(&flags, ((char *)data.mv_data + offsetof(MDB_db, md_flags)), + sizeof(uint16_t)); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. + */ + if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)) + return MDB_INCOMPATIBLE; + memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); + } + *mc->mc_dbflag &= ~DB_STALE; + } + root = mc->mc_db->md_root; + + if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ + mdbx_debug("tree is empty"); + return MDB_NOTFOUND; + } + } + + mdbx_cassert(mc, root > 1); + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) + return rc; + + mc->mc_snum = 1; + mc->mc_top = 0; + + mdbx_debug("db %d root page %zu has flags 0x%X", DDBI(mc), root, + mc->mc_pg[0]->mp_flags); + + if (flags & MDB_PS_MODIFY) { + if (unlikely(rc = mdbx_page_touch(mc))) + return rc; + } + + if (flags & MDB_PS_ROOTONLY) + return MDB_SUCCESS; + + return mdbx_page_search_root(mc, key, flags); +} + +static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { + MDB_txn *txn = mc->mc_txn; + pgno_t pg = mp->mp_pgno; + unsigned x = 0, ovpages = mp->mp_pages; + MDB_env *env = txn->mt_env; + MDB_IDL sl = txn->mt_spill_pgs; + MDB_ID pn = pg << 1; + int rc; + + mdbx_debug("free ov page %zu (%u)", pg, ovpages); + /* If the page is dirty or on the spill list we just acquired it, + * so we should give it back to our current free list, if any. + * Otherwise put it onto the list of pages we freed in this txn. + * + * Won't create me_pghead: me_pglast must be inited along with it. + * Unsupported in nested txns: They would need to hide the page + * range in ancestor txns' dirty and spilled lists. + */ + if (env->me_pghead && !txn->mt_parent && + ((mp->mp_flags & P_DIRTY) || + (sl && (x = mdbx_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) { + unsigned i, j; + pgno_t *mop; + MDB_ID2 *dl, ix, iy; + rc = mdbx_midl_need(&env->me_pghead, ovpages); + if (unlikely(rc)) + return rc; + if (!(mp->mp_flags & P_DIRTY)) { + /* This page is no longer spilled */ + if (x == sl[0]) + sl[0]--; + else + sl[x] |= 1; + goto release; + } + /* Remove from dirty list */ + dl = txn->mt_u.dirty_list; + x = dl[0].mid--; + for (ix = dl[x]; ix.mptr != mp; ix = iy) { + if (likely(x > 1)) { + x--; + iy = dl[x]; + dl[x] = ix; + } else { + mdbx_cassert(mc, x > 1); + j = ++(dl[0].mid); + dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. 
*/ + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PROBLEM; + } + } + txn->mt_dirty_room++; + if (!(env->me_flags & MDB_WRITEMAP)) + mdbx_dpage_free(env, mp); + release: + /* Insert in me_pghead */ + mop = env->me_pghead; + j = mop[0] + ovpages; + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; + while (j > i) + mop[j--] = pg++; + mop[0] += ovpages; + } else { + rc = mdbx_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + if (unlikely(rc)) + return rc; + } + mc->mc_db->md_overflow_pages -= ovpages; + return 0; +} + +/** Return the data associated with a given node. + * @param[in] mc The cursor for this operation. + * @param[in] leaf The node being read. + * @param[out] data Updated to point to the node's data. + * @return 0 on success, non-zero on failure. + */ +static MDBX_INLINE int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, + MDB_val *data) { + MDB_page *omp; /* overflow page */ + pgno_t pgno; + int rc; + + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = NODEDSZ(leaf); + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } + + /* Read overflow data. 
+ */ + data->mv_size = NODEDSZ(leaf); + memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); + if (unlikely((rc = mdbx_page_get(mc, pgno, &omp, NULL)) != 0)) { + mdbx_debug("read overflow page %zu failed", pgno); + return rc; + } + data->mv_data = PAGEDATA(omp); + + return MDB_SUCCESS; +} + +int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { + MDB_cursor mc; + MDB_xcursor mx; + int exact = 0; + DKBUF; + + mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); + + if (unlikely(!key || !data || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + mdbx_cursor_init(&mc, txn, dbi, &mx); + return mdbx_cursor_set(&mc, key, data, MDB_SET, &exact); +} + +/** Find a sibling for a page. + * Replaces the page at the top of the cursor's stack with the + * specified sibling, if one exists. + * @param[in] mc The cursor for this operation. + * @param[in] move_right Non-zero if the right sibling is requested, + * otherwise the left sibling. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { + int rc; + MDB_node *indx; + MDB_page *mp; + + if (unlikely(mc->mc_snum < 2)) { + return MDB_NOTFOUND; /* root has no siblings */ + } + + mdbx_cursor_pop(mc); + mdbx_debug("parent page is page %zu, index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); + + if (move_right + ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + mdbx_debug("no more keys left, moving to %s sibling", + move_right ? 
"right" : "left"); + if (unlikely((rc = mdbx_cursor_sibling(mc, move_right)) != MDB_SUCCESS)) { + /* undo cursor_pop before returning */ + mc->mc_top++; + mc->mc_snum++; + return rc; + } + } else { + if (move_right) + mc->mc_ki[mc->mc_top]++; + else + mc->mc_ki[mc->mc_top]--; + mdbx_debug("just moving to %s index key %u", move_right ? "right" : "left", + mc->mc_ki[mc->mc_top]); + } + mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0)) { + /* mc will be inconsistent if caller does mc_snum++ as above */ + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + return rc; + } + + mdbx_cursor_push(mc, mp); + if (!move_right) + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + + return MDB_SUCCESS; +} + +/** Move the cursor to the next data item. */ +static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) { + MDB_page *mp; + MDB_node *leaf; + int rc; + + if ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mdbx_cursor_first(mc, key, data); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp) - 1) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (likely(rc == MDB_SUCCESS)) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + } + } + + mdbx_debug("cursor_next: top page is %zu in cursor %p", mdbx_dbg_pgno(mp), + (void *)mc); + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= 
C_DEL; + goto skip; + } + + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + mdbx_debug("=====> move to next sibling page"); + if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mdbx_debug("next page is %zu, key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + } else + mc->mc_ki[mc->mc_top]++; + +skip: + mdbx_debug("==> cursor points to page %zu with %u keys, key index %u", + mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdbx_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + } + if (data) { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the previous data item. 
*/ +static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) { + MDB_page *mp; + MDB_node *leaf; + int rc; + + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdbx_cursor_last(mc, key, data); + if (unlikely(rc)) + return rc; + mc->mc_ki[mc->mc_top]++; + } + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (likely(rc == MDB_SUCCESS)) { + MDB_GET_KEY(leaf, key); + mc->mc_flags &= ~C_EOF; + } + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; + } + } + + mdbx_debug("cursor_prev: top page is %zu in cursor %p", mdbx_dbg_pgno(mp), + (void *)mc); + + mc->mc_flags &= ~(C_EOF | C_DEL); + + if (mc->mc_ki[mc->mc_top] == 0) { + mdbx_debug("=====> move to prev sibling page"); + if ((rc = mdbx_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + mdbx_debug("prev page is %zu, key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + } else + mc->mc_ki[mc->mc_top]--; + + mdbx_debug("==> cursor points to page %zu with %u keys, key index %u", + mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdbx_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + } + if (data) { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = 
mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Set the cursor on a specific data item. */ +static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp) { + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; + + if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && + unlikely(key->mv_size != sizeof(unsigned) && + key->mv_size != sizeof(size_t))) { + mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); + return MDB_BAD_VALSIZE; + } + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + /* See if we're already on the right page */ + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; + + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_size = mc->mc_db->md_xsize; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, 0); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. 
+ */ + mc->mc_ki[mc->mc_top] = 0; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc > 0) { + unsigned i; + unsigned nkeys = NUMKEYS(mp); + if (nkeys > 1) { + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, nkeys - 1, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, nkeys - 1); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* last node was the one we wanted */ + mc->mc_ki[mc->mc_top] = nkeys - 1; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc < 0) { + if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + /* This is definitely the right page, skip search_page */ + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = + LEAF2KEY(mp, mc->mc_ki[mc->mc_top], nodekey.mv_size); + } else { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* current node was the one we wanted */ + if (exactp) + *exactp = 1; + goto set1; + } + } + rc = 0; + mc->mc_flags &= ~C_EOF; + goto set2; + } + } + /* If any parents have right-sibs, search. + * Otherwise, there's nothing further. */ + for (i = 0; i < mc->mc_top; i++) + if (mc->mc_ki[i] < NUMKEYS(mc->mc_pg[i]) - 1) + break; + if (i == mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + } + if (!mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = 0; + if (op == MDB_SET_RANGE && !exactp) { + rc = 0; + goto set1; + } else + return MDB_NOTFOUND; + } + } else { + mc->mc_pg[0] = 0; + } + + rc = mdbx_page_search(mc, key, 0); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mp)); + +set2: + leaf = mdbx_node_search(mc, key, exactp); + if (exactp != NULL && !*exactp) { + /* MDB_SET specified and not an exact match. 
*/ + return MDB_NOTFOUND; + } + + if (leaf == NULL) { + mdbx_debug("===> inexact leaf not found, goto sibling"); + if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { + mc->mc_flags |= C_EOF; + return rc; /* no entries matched */ + } + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } + +set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + } + if (likely(data)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDB_SET_RANGE, ex2p); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val olddata; + if (unlikely((rc = mdbx_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) + return rc; + rc = mc->mc_dbx->md_dcmp(data, &olddata); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + } + *data = olddata; + } else { + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + } + } + + /* The key already matches in all other cases */ + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + mdbx_debug("==> cursor placed on key [%s]", DKEY(key)); + + return rc; +} + +/** Move the cursor to the first item in the database. 
*/ +static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + return MDB_SUCCESS; + } + + if (likely(data)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + } + } + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the last item in the database. 
*/ +static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + if (likely(!(mc->mc_flags & C_EOF))) { + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDB_PS_LAST); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + } + + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED | C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = + LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + if (likely(data)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) { + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor * mc, MDB_val * key, MDB_val * data); + + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + switch (op) { + case MDB_GET_CURRENT: + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, 
mc->mc_ki[mc->mc_top], key->mv_size); + } else { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (unlikely( + !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { + mdbx_xcursor_init1(mc, leaf); + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + break; + } + rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDB_GET_CURRENT); + } else { + rc = mdbx_node_read(mc, leaf, data); + } + } + } + } + break; + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (unlikely(data == NULL)) { + rc = EINVAL; + break; + } + if (unlikely(mc->mc_xcursor == NULL)) { + rc = MDB_INCOMPATIBLE; + break; + } + /* FALLTHRU */ + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (unlikely(key == NULL)) { + rc = EINVAL; + } else { + rc = mdbx_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { + rc = EINVAL; + break; + } + if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (unlikely(data == NULL)) { + rc = EINVAL; + break; + } + if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = mdbx_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; + fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; + data->mv_data = PAGEDATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top]) - 1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_PREV_MULTIPLE: + if (data == 
NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdbx_cursor_last(mc, key, data); + else + rc = MDB_SUCCESS; + if (rc == MDB_SUCCESS) { + MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + rc = mdbx_cursor_sibling(mx, 0); + if (rc == MDB_SUCCESS) + goto fetchm; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + rc = mdbx_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + rc = mdbx_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdbx_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdbx_cursor_first; + mmove: + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { + rc = EINVAL; + break; + } + if (unlikely(mc->mc_xcursor == NULL)) { + rc = MDB_INCOMPATIBLE; + break; + } + { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_GET_KEY(leaf, key); + rc = mdbx_node_read(mc, leaf, data); + break; + } + } + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdbx_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdbx_cursor_last; + goto mmove; + default: + mdbx_debug("unhandled/unimplemented cursor operation %u", op); + rc = EINVAL; + break; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + return rc; +} + +/** Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write + *operation. + * @param[in] mc The cursor to operate on. 
+ */
+static int mdbx_cursor_touch(MDB_cursor *mc) {
+  int rc = MDB_SUCCESS;
+
+  if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY | DB_DUPDATA))) {
+    /* Touch DB record of named DB */
+    MDB_cursor mc2;
+    MDB_xcursor mcx;
+    if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
+      return MDB_BAD_DBI;
+    mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
+    rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
+    if (unlikely(rc))
+      return rc;
+    *mc->mc_dbflag |= DB_DIRTY;
+  }
+  mc->mc_top = 0; /* walk the cursor stack starting at index 0 */
+  if (mc->mc_snum) {
+    do {
+      rc = mdbx_page_touch(mc);
+    } while (!rc && ++(mc->mc_top) < mc->mc_snum);
+    mc->mc_top = mc->mc_snum - 1; /* reset to the last level, even on error */
+  }
+  return rc;
+}
+
+/** Do not spill pages to disk if txn is getting full, may fail instead */
+#define MDB_NOSPILL 0x8000
+/** Store a key/data pair through a cursor.
+ * The cursor is positioned first: with #MDB_CURRENT it must already be
+ * initialized, with #MDB_APPEND the key is compared against the last key,
+ * otherwise mdbx_cursor_set() looks the key up. The item is then written as
+ * a plain node, an overflow page, a sub-page or a sub-DB, depending on the
+ * database flags and on the sizes involved.
+ * @param[in] mc The cursor to write through. Must not be NULL.
+ * @param[in] key The key to store. Must not be NULL.
+ * @param[in,out] data The data to store; it is dereferenced unconditionally.
+ * With #MDB_MULTIPLE, data[0] describes the first element (pointer and
+ * per-element size) and data[1].mv_size holds the element count on entry
+ * and the number of elements actually stored on return.
+ * With #MDB_NOOVERWRITE and an existing key, *data receives the item found
+ * by the lookup. With #MDB_RESERVE, data->mv_data is redirected to the
+ * storage for the item instead of being copied from.
+ * @param[in] flags Put flags tested in the body: #MDB_CURRENT,
+ * #MDB_NOOVERWRITE, #MDB_NODUPDATA, #MDB_APPEND, #MDB_APPENDDUP,
+ * #MDB_MULTIPLE, #MDB_RESERVE, the internal #MDB_NOSPILL, and the node
+ * flag #F_SUBDATA.
+ * @return #MDB_SUCCESS, or a non-zero error. Codes returned directly here:
+ * EINVAL, #MDB_VERSION_MISMATCH, #MDB_INCOMPATIBLE, EACCES, #MDB_BAD_TXN,
+ * #MDB_BAD_VALSIZE, #MDB_KEYEXIST, #MDB_PROBLEM, ENOMEM; errors from helper
+ * calls are passed through. A failed node insert/page split or a failed
+ * sub-DB write (label bad_sub) also sets #MDB_TXN_ERROR on the transaction;
+ * the other error paths in this function return without setting it here.
+ */
+int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
+                    unsigned flags) {
+  MDB_env *env;
+  MDB_node *leaf = NULL;
+  MDB_page *fp, *mp, *sub_root = NULL;
+  uint16_t fp_flags;
+  MDB_val xdata, *rdata, dkey, olddata;
+  MDB_db dummy;
+  int do_sub = 0, insert_key, insert_data;
+  unsigned mcount = 0, dcount = 0, nospill;
+  size_t nsize;
+  int rc, rc2;
+  unsigned nflags;
+  DKBUF;
+
+  if (unlikely(mc == NULL || key == NULL))
+    return EINVAL;
+
+  if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE))
+    return MDB_VERSION_MISMATCH;
+
+  env = mc->mc_txn->mt_env;
+
+  /* Check this first so counter will always be zero on any
+   * early failures.
+   */
+  if (flags & MDB_MULTIPLE) {
+    dcount = data[1].mv_size;
+    data[1].mv_size = 0;
+    if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)))
+      return MDB_INCOMPATIBLE;
+  }
+
+  if (flags & MDB_RESERVE) {
+    if (unlikely(mc->mc_db->md_flags & (MDB_DUPSORT | MDB_REVERSEDUP)))
+      return MDB_INCOMPATIBLE;
+  }
+
+  nospill = flags & MDB_NOSPILL; /* remember the internal flag ... */
+  flags &= ~MDB_NOSPILL;         /* ... and hide it from the tests below */
+
+  if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED)))
+    return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
+
+  if (unlikely(key->mv_size > ENV_MAXKEY(env)))
+    return MDB_BAD_VALSIZE;
+
+#if SIZE_MAX > MAXDATASIZE
+  if (unlikely(data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT)
+                                    ? ENV_MAXKEY(env)
+                                    : MAXDATASIZE)))
+    return MDB_BAD_VALSIZE;
+#else
+  if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
+      unlikely(data->mv_size > ENV_MAXKEY(env)))
+    return MDB_BAD_VALSIZE;
+#endif
+
+  if ((mc->mc_db->md_flags & MDB_INTEGERKEY) &&
+      unlikely(key->mv_size != sizeof(unsigned) &&
+               key->mv_size != sizeof(size_t))) {
+    mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY");
+    return MDB_BAD_VALSIZE;
+  }
+
+  if ((mc->mc_db->md_flags & MDB_INTEGERDUP) &&
+      unlikely(data->mv_size != sizeof(unsigned) &&
+               data->mv_size != sizeof(size_t))) {
+    mdbx_cassert(mc, !"data-size is invalid MDB_INTEGERDUP");
+    return MDB_BAD_VALSIZE;
+  }
+
+  mdbx_debug("==> put db %d key [%s], size %zu, data size %zu", DDBI(mc),
+             DKEY(key), key ? key->mv_size : 0, data->mv_size);
+
+  int dupdata_flag = 0; /* set when a single item is converted to a sub-page */
+  if (flags & MDB_CURRENT) {
+    if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
+      return EINVAL;
+#if MDBX_MODE_ENABLED
+    if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
+      MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+      if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
+        mdbx_cassert(mc,
+                     mc->mc_xcursor != NULL &&
+                         (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED));
+        if (mc->mc_xcursor->mx_db.md_entries > 1) {
+          rc = mdbx_cursor_del(mc, 0);
+          if (rc != MDB_SUCCESS)
+            return rc;
+          flags -= MDB_CURRENT; /* continue as an ordinary (non-CURRENT) put */
+        }
+      }
+    }
+#endif /* MDBX_MODE_ENABLED */
+    rc = MDB_SUCCESS;
+  } else if (mc->mc_db->md_root == P_INVALID) {
+    /* new database, cursor has nothing to point to */
+    mc->mc_snum = 0;
+    mc->mc_top = 0;
+    mc->mc_flags &= ~C_INITIALIZED;
+    rc = MDB_NO_ROOT;
+  } else {
+    int exact = 0;
+    MDB_val d2;
+    if (flags & MDB_APPEND) {
+      MDB_val k2;
+      rc = mdbx_cursor_last(mc, &k2, &d2);
+      if (rc == 0) {
+        rc = mc->mc_dbx->md_cmp(key, &k2);
+        if (rc > 0) {
+          rc = MDB_NOTFOUND;
+          mc->mc_ki[mc->mc_top]++;
+        } else {
+          /* new key is <= last key */
+          rc = MDB_KEYEXIST;
+        }
+      }
+    } else {
+      rc = mdbx_cursor_set(mc, key, &d2, MDB_SET, &exact);
+    }
+    if ((flags & MDB_NOOVERWRITE) && rc == 0) {
+      mdbx_debug("duplicate key [%s]", DKEY(key));
+      *data = d2;
+      return MDB_KEYEXIST;
+    }
+    if (rc && unlikely(rc != MDB_NOTFOUND))
+      return rc;
+  }
+
+  if (mc->mc_flags & C_DEL)
+    mc->mc_flags ^= C_DEL;
+
+  /* Cursor is positioned, check for room in the dirty list */
+  if (!nospill) {
+    if (flags & MDB_MULTIPLE) {
+      rdata = &xdata;
+      xdata.mv_size = data->mv_size * dcount;
+    } else {
+      rdata = data;
+    }
+    if (unlikely(rc2 = mdbx_page_spill(mc, key, rdata)))
+      return rc2;
+  }
+
+  if (rc == MDB_NO_ROOT) {
+    MDB_page *np;
+    /* new database, write a root leaf page */
+    mdbx_debug("allocating new root leaf page");
+    if (unlikely(rc2 = mdbx_page_new(mc, P_LEAF, 1, &np))) {
+      return rc2;
+    }
+    mdbx_cursor_push(mc, np);
+    mc->mc_db->md_root = np->mp_pgno;
+    mc->mc_db->md_depth++;
+    *mc->mc_dbflag |= DB_DIRTY;
+    if ((mc->mc_db->md_flags & (MDB_DUPSORT | MDB_DUPFIXED)) == MDB_DUPFIXED)
+      np->mp_flags |= P_LEAF2;
+    mc->mc_flags |= C_INITIALIZED;
+  } else {
+    /* make sure all cursor pages are writable */
+    rc2 = mdbx_cursor_touch(mc);
+    if (unlikely(rc2))
+      return rc2;
+  }
+
+  insert_key = insert_data = rc; /* non-zero here is MDB_NOTFOUND or MDB_NO_ROOT: the key is new */
+  if (insert_key) {
+    /* The key does not exist */
+    mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]);
+    if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
+        LEAFSIZE(key, data) > env->me_nodemax) {
+      /* Too big for a node, insert in sub-DB.  Set up an empty
+       * "old sub-page" for prep_subDB to expand to a full page.
+       */
+      fp_flags = P_LEAF | P_DIRTY;
+      fp = env->me_pbuf;
+      fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */
+      fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE);
+      olddata.mv_size = PAGEHDRSZ;
+      goto prep_subDB;
+    }
+  } else {
+    /* there's only a key anyway, so this is a no-op */
+    if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
+      char *ptr;
+      unsigned ksize = mc->mc_db->md_xsize;
+      if (key->mv_size != ksize)
+        return MDB_BAD_VALSIZE;
+      ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
+      memcpy(ptr, key->mv_data, ksize);
+    fix_parent:
+      /* if overwriting slot 0 of leaf, need to
+       * update branch key if there is a parent page
+       */
+      if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
+        unsigned short dtop = 1; /* levels climbed; added back to mc_top below */
+        mc->mc_top--;
+        /* slot 0 is always an empty key, find real slot */
+        while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
+          mc->mc_top--;
+          dtop++;
+        }
+        if (mc->mc_ki[mc->mc_top])
+          rc2 = mdbx_update_key(mc, key);
+        else
+          rc2 = MDB_SUCCESS;
+        mc->mc_top += dtop;
+        if (rc2)
+          return rc2;
+      }
+      return MDB_SUCCESS;
+    }
+
+  more:
+    leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+    olddata.mv_size = NODEDSZ(leaf);
+    olddata.mv_data = NODEDATA(leaf);
+
+    /* DB has dups? */
+    if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
+      /* Prepare (sub-)page/sub-DB to accept the new item,
+       * if needed.  fp: old sub-page or a header faking
+       * it.  mp: new (sub-)page.  offset: growth in page
+       * size.  xdata: node data with new page or DB.
+       */
+      unsigned i, offset = 0;
+      mp = fp = xdata.mv_data = env->me_pbuf;
+      mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
+
+      /* Was a single item before, must convert now */
+      if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
+        /* Just overwrite the current item */
+        if (flags & MDB_CURRENT) {
+          if ((flags & MDB_NODUPDATA) && !mc->mc_dbx->md_dcmp(data, &olddata))
+            return MDB_KEYEXIST;
+          goto current;
+        }
+
+        /* does data match? */
+        if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
+          if (unlikely(flags & (MDB_NODUPDATA | MDB_APPENDDUP)))
+            return MDB_KEYEXIST;
+          /* overwrite it */
+          goto current;
+        }
+
+        /* Back up original data item */
+        dupdata_flag = 1;
+        dkey.mv_size = olddata.mv_size;
+        dkey.mv_data = memcpy(fp + 1, olddata.mv_data, olddata.mv_size);
+
+        /* Make sub-page header for the dup items, with dummy body */
+        fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP;
+        fp->mp_lower = (PAGEHDRSZ - PAGEBASE);
+        xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
+        if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+          fp->mp_flags |= P_LEAF2;
+          fp->mp_leaf2_ksize = data->mv_size;
+          xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
+        } else {
+          xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
+                           (dkey.mv_size & 1) + (data->mv_size & 1);
+        }
+        fp->mp_upper = xdata.mv_size - PAGEBASE;
+        olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
+      } else if (leaf->mn_flags & F_SUBDATA) {
+        /* Data is on sub-DB, just store it */
+        flags |= F_DUPDATA | F_SUBDATA;
+        goto put_sub;
+      } else {
+        /* Data is on sub-page */
+        fp = olddata.mv_data;
+        switch (flags) {
+        default:
+          if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
+            offset = EVEN(NODESIZE + sizeof(indx_t) + data->mv_size);
+            break;
+          }
+          offset = fp->mp_leaf2_ksize;
+          if (SIZELEFT(fp) < offset) {
+            offset *= 4; /* space for 4 more */
+            break;
+          }
+        /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
+        case MDB_CURRENT | MDB_NODUPDATA:
+        case MDB_CURRENT:
+          fp->mp_flags |= P_DIRTY;
+          COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
+          mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
+          flags |= F_DUPDATA;
+          goto put_sub;
+        }
+        xdata.mv_size = olddata.mv_size + offset;
+      }
+
+      fp_flags = fp->mp_flags;
+      if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
+        /* Too big for a sub-page, convert to sub-DB */
+        fp_flags &= ~P_SUBP;
+      prep_subDB:
+        if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+          fp_flags |= P_LEAF2;
+          dummy.md_xsize = fp->mp_leaf2_ksize;
+          dummy.md_flags = MDB_DUPFIXED;
+          if (mc->mc_db->md_flags & MDB_INTEGERDUP)
+            dummy.md_flags |= MDB_INTEGERKEY;
+        } else {
+          dummy.md_xsize = 0;
+          dummy.md_flags = 0;
+        }
+        dummy.md_depth = 1;
+        dummy.md_branch_pages = 0;
+        dummy.md_leaf_pages = 1;
+        dummy.md_overflow_pages = 0;
+        dummy.md_entries = NUMKEYS(fp);
+        xdata.mv_size = sizeof(MDB_db);
+        xdata.mv_data = &dummy;
+        if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL)))
+          return rc;
+        offset = env->me_psize - olddata.mv_size;
+        flags |= F_DUPDATA | F_SUBDATA;
+        dummy.md_root = mp->mp_pgno;
+        sub_root = mp;
+      }
+      if (mp != fp) {
+        mp->mp_flags = fp_flags | P_DIRTY;
+        mp->mp_leaf2_ksize = fp->mp_leaf2_ksize;
+        mp->mp_lower = fp->mp_lower;
+        mp->mp_upper = fp->mp_upper + offset;
+        if (fp_flags & P_LEAF2) {
+          memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize);
+        } else {
+          memcpy((char *)mp + mp->mp_upper + PAGEBASE,
+                 (char *)fp + fp->mp_upper + PAGEBASE,
+                 olddata.mv_size - fp->mp_upper - PAGEBASE);
+          for (i = 0; i < NUMKEYS(fp); i++)
+            mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
+        }
+      }
+
+      rdata = &xdata;
+      flags |= F_DUPDATA;
+      do_sub = 1;
+      if (!insert_key)
+        mdbx_node_del(mc, 0);
+      goto new_sub;
+    }
+  current:
+    /* LMDB passes F_SUBDATA in 'flags' to write a DB record */
+    if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA))
+      return MDB_INCOMPATIBLE;
+    /* overflow page overwrites need special handling */
+    if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
+      MDB_page *omp;
+      pgno_t pg;
+      int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
+
+      memcpy(&pg, olddata.mv_data, sizeof(pg));
+      if (unlikely((rc2 = mdbx_page_get(mc, pg, &omp, &level)) != 0))
+        return rc2;
+      ovpages = omp->mp_pages;
+
+      /* Is the ov page large enough? */
+      if (ovpages >= dpages) {
+        if (!(omp->mp_flags & P_DIRTY) &&
+            (level || (env->me_flags & MDB_WRITEMAP))) {
+          rc = mdbx_page_unspill(mc->mc_txn, omp, &omp);
+          if (unlikely(rc))
+            return rc;
+          level = 0; /* dirty in this txn or clean */
+        }
+        /* Is it dirty? */
+        if (omp->mp_flags & P_DIRTY) {
+          /* yes, overwrite it. Note in this case we don't
+           * bother to try shrinking the page if the new data
+           * is smaller than the overflow threshold.
+           */
+          if (unlikely(level > 1)) {
+            /* It is writable only in a parent txn */
+            MDB_page *np = mdbx_page_malloc(mc->mc_txn, ovpages);
+            MDB_ID2 id2;
+            if (unlikely(!np))
+              return ENOMEM;
+            id2.mid = pg;
+            id2.mptr = np;
+            /* Note - this page is already counted in parent's dirty_room */
+            rc2 = mdbx_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
+            mdbx_cassert(mc, rc2 == 0); /* NOTE(review): rc2 is checked only by this assertion - confirm the insert cannot fail here */
+            /* Currently we make the page look as with put() in the
+             * parent txn, in case the user peeks at MDB_RESERVEd
+             * or unused parts. Some users treat ovpages specially.
+             */
+            size_t sz = (size_t)env->me_psize * ovpages, off;
+            if (MDBX_MODE_ENABLED || !(flags & MDB_RESERVE)) {
+              /* Skip the part where LMDB will put *data.
+               * Copy end of page, adjusting alignment so
+               * compiler may copy words instead of bytes.
+               */
+              off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
+              memcpy((size_t *)((char *)np + off),
+                     (size_t *)((char *)omp + off), sz - off);
+              sz = PAGEHDRSZ;
+            }
+            memcpy(np, omp, sz); /* Copy whole or header of page */
+            omp = np;
+          }
+          SETDSZ(leaf, data->mv_size);
+          if (F_ISSET(flags, MDB_RESERVE))
+            data->mv_data = PAGEDATA(omp);
+          else
+            memcpy(PAGEDATA(omp), data->mv_data, data->mv_size);
+          return MDB_SUCCESS;
+        }
+      }
+      if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDB_SUCCESS)
+        return rc2;
+    } else if (data->mv_size == olddata.mv_size) {
+      /* same size, just replace it. Note that we could
+       * also reuse this node if the new data is smaller,
+       * but instead we opt to shrink the node in that case.
+       */
+      if (F_ISSET(flags, MDB_RESERVE))
+        data->mv_data = olddata.mv_data;
+      else if (!(mc->mc_flags & C_SUB))
+        memcpy(olddata.mv_data, data->mv_data, data->mv_size);
+      else {
+        memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
+        goto fix_parent;
+      }
+      return MDB_SUCCESS;
+    }
+    mdbx_node_del(mc, 0);
+  }
+
+  rdata = data;
+
+new_sub:
+  nflags = flags & NODE_ADD_FLAGS;
+  nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size
+                                          : mdbx_leaf_size(env, key, rdata);
+  if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
+    if ((flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA)
+      nflags &= ~MDB_APPEND; /* sub-page may need room to grow */
+    if (!insert_key)
+      nflags |= MDB_SPLIT_REPLACE;
+    rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags);
+  } else {
+    /* There is room already in this leaf page. */
+    rc = mdbx_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
+    if (likely(rc == 0)) {
+      /* Adjust other cursors pointing to mp */
+      MDB_cursor *m2, *m3;
+      MDB_dbi dbi = mc->mc_dbi;
+      unsigned i = mc->mc_top;
+      MDB_page *mp = mc->mc_pg[i];
+
+      for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
+        if (mc->mc_flags & C_SUB)
+          m3 = &m2->mc_xcursor->mx_cursor;
+        else
+          m3 = m2;
+        if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp)
+          continue;
+        if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
+          m3->mc_ki[i]++;
+        }
+        if (XCURSOR_INITED(m3))
+          XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]);
+      }
+    }
+  }
+
+  if (likely(rc == MDB_SUCCESS)) {
+    /* Now store the actual data in the child DB. Note that we're
+     * storing the user data in the keys field, so there are strict
+     * size limits on dupdata. The actual data fields of the child
+     * DB are all zero size. */
+    if (do_sub) {
+      int xflags;
+      size_t ecount;
+    put_sub:
+      xdata.mv_size = 0;
+      xdata.mv_data = "";
+      leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+      if (flags & MDB_CURRENT) {
+        xflags = (flags & MDB_NODUPDATA)
+                     ? MDB_CURRENT | MDB_NOOVERWRITE | MDB_NOSPILL
+                     : MDB_CURRENT | MDB_NOSPILL;
+      } else {
+        mdbx_xcursor_init1(mc, leaf);
+        xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE | MDB_NOSPILL
+                                         : MDB_NOSPILL;
+      }
+      if (sub_root)
+        mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root;
+      /* converted, write the original data first */
+      if (dupdata_flag) {
+        rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
+        if (unlikely(rc))
+          goto bad_sub;
+        /* we've done our job */
+        dkey.mv_size = 0;
+      }
+      if (!(leaf->mn_flags & F_SUBDATA) || sub_root) {
+        /* Adjust other cursors pointing to mp */
+        MDB_cursor *m2;
+        MDB_xcursor *mx = mc->mc_xcursor;
+        unsigned i = mc->mc_top;
+        MDB_page *mp = mc->mc_pg[i];
+        int nkeys = NUMKEYS(mp);
+
+        for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) {
+          if (m2 == mc || m2->mc_snum < mc->mc_snum)
+            continue;
+          if (!(m2->mc_flags & C_INITIALIZED))
+            continue;
+          if (m2->mc_pg[i] == mp) {
+            if (m2->mc_ki[i] == mc->mc_ki[i]) {
+              mdbx_xcursor_init2(m2, mx, dupdata_flag);
+            } else if (!insert_key && m2->mc_ki[i] < nkeys) {
+              XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]);
+            }
+          }
+        }
+      }
+      ecount = mc->mc_xcursor->mx_db.md_entries;
+      if (flags & MDB_APPENDDUP)
+        xflags |= MDB_APPEND;
+      rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
+      if (flags & F_SUBDATA) {
+        void *db = NODEDATA(leaf);
+        memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
+      }
+      insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; /* non-zero only if the sub-DB entry count changed */
+    }
+    /* Increment count unless we just replaced an existing item. */
+    if (insert_data)
+      mc->mc_db->md_entries++;
+    if (insert_key) {
+      /* Invalidate txn if we created an empty sub-DB */
+      if (unlikely(rc))
+        goto bad_sub;
+      /* If we succeeded and the key didn't exist before,
+       * make sure the cursor is marked valid. */
+      mc->mc_flags |= C_INITIALIZED;
+    }
+    if (flags & MDB_MULTIPLE) {
+      if (!rc) {
+        mcount++;
+        /* let caller know how many succeeded, if any */
+        data[1].mv_size = mcount;
+        if (mcount < dcount) {
+          data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
+          insert_key = insert_data = 0;
+          goto more;
+        }
+      }
+    }
+    return rc;
+  bad_sub:
+    if (unlikely(rc ==
+                 MDB_KEYEXIST)) /* should not happen, we deleted that item */
+      rc = MDB_PROBLEM;
+  }
+  mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+  return rc;
+}
+/** Delete the item the cursor points at.
+ * For a key with duplicates (#F_DUPDATA): with #MDB_NODUPDATA in flags all
+ * duplicates go away together with the key; otherwise only the current
+ * duplicate is deleted through the sub-cursor, and the key itself is removed
+ * only when no duplicates are left.
+ * @param[in] mc The cursor to delete at. Must be initialized.
+ * @param[in] flags #MDB_NODUPDATA, the internal #MDB_NOSPILL (skips the
+ * mdbx_page_spill() call), or the node flag #F_SUBDATA.
+ * @return 0 on success, non-zero on failure: EINVAL (NULL or uninitialized
+ * cursor), #MDB_VERSION_MISMATCH, EACCES or #MDB_BAD_TXN (transaction
+ * state), #MDB_NOTFOUND (cursor index past the last key of its page),
+ * #MDB_INCOMPATIBLE (#F_SUBDATA differs between flags and a non-dup node),
+ * or an error from a helper. Failures routed through the fail label set
+ * #MDB_TXN_ERROR.
+ */
+int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) {
+  MDB_node *leaf;
+  MDB_page *mp;
+  int rc;
+
+  if (unlikely(!mc))
+    return EINVAL;
+
+  if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE))
+    return MDB_VERSION_MISMATCH;
+
+  if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED)))
+    return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
+
+  if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
+    return EINVAL;
+
+  if (unlikely(mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])))
+    return MDB_NOTFOUND;
+
+  if (unlikely(!(flags & MDB_NOSPILL) &&
+               (rc = mdbx_page_spill(mc, NULL, NULL))))
+    return rc;
+
+  rc = mdbx_cursor_touch(mc);
+  if (unlikely(rc))
+    return rc;
+
+  mp = mc->mc_pg[mc->mc_top];
+  if (IS_LEAF2(mp))
+    goto del_key; /* LEAF2 pages hold bare keys: no node flags to inspect */
+  leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
+
+  if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
+    if (flags & MDB_NODUPDATA) {
+      /* mdbx_cursor_del0() will subtract the final entry */
+      mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
+      mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
+    } else {
+      if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
+        mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
+      }
+      rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL);
+      if (unlikely(rc))
+        return rc;
+      /* If sub-DB still has entries, we're done */
+      if (mc->mc_xcursor->mx_db.md_entries) {
+        if (leaf->mn_flags & F_SUBDATA) {
+          /* update subDB info */
+          void *db = NODEDATA(leaf);
+          memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
+        } else {
+          MDB_cursor *m2;
+          /* shrink fake page */
+          mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]);
+          leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
+          mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
+          /* fix other sub-DB cursors pointed at fake pages on this page */
+          for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) {
+            if (m2 == mc || m2->mc_snum < mc->mc_snum)
+              continue;
+            if (!(m2->mc_flags & C_INITIALIZED))
+              continue;
+            if (m2->mc_pg[mc->mc_top] == mp) {
+              MDB_node *n2 = leaf;
+              if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) {
+                n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]);
+                if (n2->mn_flags & F_SUBDATA)
+                  continue;
+              }
+              m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2);
+            }
+          }
+        }
+        mc->mc_db->md_entries--;
+        return rc;
+      } else {
+        mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
+      }
+      /* otherwise fall thru and delete the sub-DB */
+    }
+
+    if (leaf->mn_flags & F_SUBDATA) {
+      /* add all the child DB's pages to the free list */
+      rc = mdbx_drop0(&mc->mc_xcursor->mx_cursor, 0);
+      if (unlikely(rc))
+        goto fail;
+    }
+  }
+  /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */
+  else if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) {
+    rc = MDB_INCOMPATIBLE;
+    goto fail;
+  }
+
+  /* add overflow pages to free list */
+  if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
+    MDB_page *omp;
+    pgno_t pg;
+
+    memcpy(&pg, NODEDATA(leaf), sizeof(pg));
+    if (unlikely((rc = mdbx_page_get(mc, pg, &omp, NULL)) ||
+                 (rc = mdbx_ovpage_free(mc, omp))))
+      goto fail;
+  }
+
+del_key:
+  return mdbx_cursor_del0(mc);
+
+fail:
+  mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+  return rc;
+}
+
+/** Allocate and initialize new pages for a database.
+ * Set #MDB_TXN_ERROR on failure.
+ * @param[in] mc a cursor on the database being added to.
+ * @param[in] flags flags defining what type of page is being allocated.
+ * @param[in] num the number of pages to allocate.
This is usually 1,
+ * unless allocating overflow pages for a large record.
+ * @param[out] mp Address of a page, or NULL on failure.
+ * @return 0 on success, non-zero on failure.
+ */
+static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num,
+                         MDB_page **mp) {
+  MDB_page *np;
+  int rc;
+
+  if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL))))
+    return rc; /* note: *mp is not written on this path */
+  mdbx_debug("allocated new mpage %zu, page size %u", np->mp_pgno,
+             mc->mc_txn->mt_env->me_psize);
+  np->mp_flags = flags | P_DIRTY;
+  np->mp_lower = (PAGEHDRSZ - PAGEBASE); /* empty page: no index slots yet */
+  np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
+
+  /* keep the per-database page counters in step with the page type */
+  if (IS_BRANCH(np))
+    mc->mc_db->md_branch_pages++;
+  else if (IS_LEAF(np))
+    mc->mc_db->md_leaf_pages++;
+  else if (IS_OVERFLOW(np)) {
+    mc->mc_db->md_overflow_pages += num;
+    np->mp_pages = num;
+  }
+  *mp = np;
+
+  return 0;
+}
+
+/** Calculate the size of a leaf node.
+ * The size depends on the environment's page size; if a data item
+ * is too large it will be put onto an overflow page and the node
+ * size will only include the key and not the data. Sizes are always
+ * rounded up to an even number of bytes, to guarantee 2-byte alignment
+ * of the #MDB_node headers.
+ * The result also includes one indx_t page-index slot.
+ * @param[in] env The environment handle.
+ * @param[in] key The key for the node.
+ * @param[in] data The data for the node.
+ * @return The number of bytes needed to store the node.
+ */
+static MDBX_INLINE size_t mdbx_leaf_size(MDB_env *env, MDB_val *key,
+                                         MDB_val *data) {
+  size_t sz;
+
+  sz = LEAFSIZE(key, data);
+  if (sz > env->me_nodemax) {
+    /* put on overflow page */
+    sz -= data->mv_size - sizeof(pgno_t); /* count a pgno_t in place of the data */
+  }
+
+  return EVEN(sz + sizeof(indx_t));
+}
+
+/** Calculate the size of a branch node.
+ * The size should depend on the environment's page size but since
+ * we currently don't support spilling large keys onto overflow
+ * pages, it's simply the size of the #MDB_node header plus the
+ * size of the key.
Sizes are always rounded up to an even number
+ * of bytes, to guarantee 2-byte alignment of the #MDB_node headers.
+ * @param[in] env The environment handle.
+ * @param[in] key The key for the node.
+ * @return The number of bytes needed to store the node.
+ */
+static MDBX_INLINE size_t mdbx_branch_size(MDB_env *env, MDB_val *key) {
+  size_t sz;
+
+  sz = INDXSIZE(key);
+  if (unlikely(sz > env->me_nodemax)) {
+    /* put on overflow page */
+    /* not implemented */
+    mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __FUNCTION__,
+                     __LINE__);
+    sz -= key->mv_size - sizeof(pgno_t);
+  }
+
+  return sz + sizeof(indx_t); /* NOTE(review): no EVEN() rounding is applied here, unlike mdbx_leaf_size() - confirm callers do not rely on an even result */
+}
+
+/** Add a node to the page pointed to by the cursor.
+ * Set #MDB_TXN_ERROR on failure.
+ * @param[in] mc The cursor for this operation.
+ * @param[in] indx The index on the page where the new node should be added.
+ * @param[in] key The key for the new node.
+ * @param[in] data The data for the new node, if any.
+ * @param[in] pgno The page number, if adding a branch node.
+ * @param[in] flags Flags for the node.
+ * @return 0 on success, non-zero on failure. Possible errors are:
+ *
<ul>
+ * <li>ENOMEM - failed to allocate overflow pages for the node.
+ * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error
+ * should never happen since all callers already calculate the
+ * page's free space before calling this function.
+ * </ul>
+ */
+static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key,
+                         MDB_val *data, pgno_t pgno, unsigned flags) {
+  unsigned i;
+  size_t node_size = NODESIZE;
+  ssize_t room;
+  indx_t ofs;
+  MDB_node *node;
+  MDB_page *mp = mc->mc_pg[mc->mc_top];
+  MDB_page *ofp = NULL; /* overflow page */
+  void *ndata;
+  DKBUF;
+
+  mdbx_cassert(mc, mp->mp_upper >= mp->mp_lower);
+
+  mdbx_debug("add to %s %spage %zu index %i, data size %zu key size %zu [%s]",
+             IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
+             mdbx_dbg_pgno(mp), indx, data ? data->mv_size : 0,
+             key ? key->mv_size : 0, key ? DKEY(key) : "null");
+
+  if (IS_LEAF2(mp)) {
+    mdbx_cassert(mc, key);
+    /* Move higher keys up one slot. */
+    int ksize = mc->mc_db->md_xsize, dif;
+    char *ptr = LEAF2KEY(mp, indx, ksize);
+    dif = NUMKEYS(mp) - indx;
+    if (dif > 0)
+      memmove(ptr + ksize, ptr, dif * ksize);
+    /* insert new key */
+    memcpy(ptr, key->mv_data, ksize);
+
+    /* Just using these for counting */
+    mp->mp_lower += sizeof(indx_t);
+    mp->mp_upper -= ksize - sizeof(indx_t);
+    return MDB_SUCCESS;
+  }
+
+  room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); /* free bytes left once the new index slot is reserved */
+  if (key != NULL)
+    node_size += key->mv_size;
+  if (IS_LEAF(mp)) {
+    mdbx_cassert(mc, key && data);
+    if (unlikely(F_ISSET(flags, F_BIGDATA))) {
+      /* Data already on overflow page. */
+      node_size += sizeof(pgno_t);
+    } else if (unlikely(node_size + data->mv_size >
+                        mc->mc_txn->mt_env->me_nodemax)) {
+      int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
+      int rc;
+      /* Put data on overflow page. */
+      mdbx_debug(
+          "data size is %zu, node would be %zu, put data on overflow page",
+          data->mv_size, node_size + data->mv_size);
+      node_size = EVEN(node_size + sizeof(pgno_t));
+      if ((ssize_t)node_size > room)
+        goto full;
+      if ((rc = mdbx_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
+        return rc; /* passed through; this path does not set MDB_TXN_ERROR itself */
+      mdbx_debug("allocated overflow page %zu", ofp->mp_pgno);
+      flags |= F_BIGDATA;
+      goto update;
+    } else {
+      node_size += data->mv_size;
+    }
+  }
+  node_size = EVEN(node_size);
+  if (unlikely((ssize_t)node_size > room))
+    goto full;
+
+update:
+  /* Move higher pointers up one slot. */
+  for (i = NUMKEYS(mp); i > indx; i--)
+    mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
+
+  /* Adjust free space offsets. */
+  ofs = mp->mp_upper - node_size;
+  mdbx_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
+  mp->mp_ptrs[indx] = ofs;
+  mp->mp_upper = ofs;
+  mp->mp_lower += sizeof(indx_t);
+
+  /* Write the node data. */
+  node = NODEPTR(mp, indx);
+  node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
+  node->mn_flags = flags;
+  if (IS_LEAF(mp))
+    SETDSZ(node, data->mv_size);
+  else
+    SETPGNO(node, pgno);
+
+  if (key)
+    memcpy(NODEKEY(node), key->mv_data, key->mv_size);
+
+  if (IS_LEAF(mp)) {
+    ndata = NODEDATA(node);
+    if (unlikely(ofp == NULL)) { /* no overflow page was allocated by this call */
+      if (unlikely(F_ISSET(flags, F_BIGDATA)))
+        memcpy(ndata, data->mv_data, sizeof(pgno_t)); /* data already holds the overflow page number */
+      else if (F_ISSET(flags, MDB_RESERVE))
+        data->mv_data = ndata;
+      else if (likely(ndata != data->mv_data))
+        memcpy(ndata, data->mv_data, data->mv_size);
+    } else {
+      memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); /* node stores the overflow page number */
+      ndata = PAGEDATA(ofp);
+      if (F_ISSET(flags, MDB_RESERVE))
+        data->mv_data = ndata;
+      else if (likely(ndata != data->mv_data))
+        memcpy(ndata, data->mv_data, data->mv_size);
+    }
+  }
+
+  return MDB_SUCCESS;
+
+full:
+  mdbx_debug("not enough room in page %zu, got %u ptrs", mdbx_dbg_pgno(mp),
+             NUMKEYS(mp));
+  mdbx_debug("upper-lower = %u - %u = %zd", mp->mp_upper, mp->mp_lower, room);
+  mdbx_debug("node size = %zu", node_size);
+  mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+  return MDB_PAGE_FULL;
+}
+
+/** Delete the specified node from a page.
+ * The node's index slot is removed, the gap it leaves in the node area is
+ * closed, and mp_lower/mp_upper are updated. On a LEAF2 page the following
+ * keys are simply moved down by one ksize-sized slot.
+ * @param[in] mc Cursor pointing to the node to delete.
+ * @param[in] ksize The size of a node. Only used if the page is
+ * part of a #MDB_DUPFIXED database.
+ */
+static void mdbx_node_del(MDB_cursor *mc, int ksize) {
+  MDB_page *mp = mc->mc_pg[mc->mc_top];
+  indx_t indx = mc->mc_ki[mc->mc_top];
+  unsigned sz;
+  indx_t i, j, numkeys, ptr;
+  MDB_node *node;
+  char *base;
+
+  mdbx_debug("delete node %u on %s page %zu", indx,
+             IS_LEAF(mp) ? "leaf" : "branch", mdbx_dbg_pgno(mp));
+  numkeys = NUMKEYS(mp);
+  mdbx_cassert(mc, indx < numkeys);
+
+  if (IS_LEAF2(mp)) {
+    int x = numkeys - 1 - indx; /* number of keys after the deleted one */
+    base = LEAF2KEY(mp, indx, ksize);
+    if (x)
+      memmove(base, base + ksize, x * ksize);
+    mp->mp_lower -= sizeof(indx_t);
+    mp->mp_upper += ksize - sizeof(indx_t);
+    return;
+  }
+
+  node = NODEPTR(mp, indx);
+  sz = NODESIZE + node->mn_ksize;
+  if (IS_LEAF(mp)) {
+    if (F_ISSET(node->mn_flags, F_BIGDATA))
+      sz += sizeof(pgno_t);
+    else
+      sz += NODEDSZ(node);
+  }
+  sz = EVEN(sz);
+
+  ptr = mp->mp_ptrs[indx];
+  for (i = j = 0; i < numkeys; i++) {
+    if (i != indx) {
+      mp->mp_ptrs[j] = mp->mp_ptrs[i];
+      if (mp->mp_ptrs[i] < ptr)
+        mp->mp_ptrs[j] += sz; /* this node is about to be moved up by sz */
+      j++;
+    }
+  }
+
+  base = (char *)mp + mp->mp_upper + PAGEBASE;
+  memmove(base + sz, base, ptr - mp->mp_upper); /* slide the nodes stored below the deleted one up by sz */
+
+  mp->mp_lower -= sizeof(indx_t);
+  mp->mp_upper += sz;
+}
+
+/** Compact the main page after deleting a node on a subpage.
+ * @param[in] mp The main page to operate on.
+ * @param[in] indx The index of the subpage on the main page.
+ */
+static void mdbx_node_shrink(MDB_page *mp, indx_t indx) {
+  MDB_node *node;
+  MDB_page *sp, *xp;
+  char *base;
+  indx_t delta, nsize, len, ptr;
+  int i;
+
+  node = NODEPTR(mp, indx);
+  sp = (MDB_page *)NODEDATA(node);
+  delta = SIZELEFT(sp);           /* unused bytes inside the sub-page */
+  nsize = NODEDSZ(node) - delta;  /* node data size once that slack is removed */
+
+  /* Prepare to shift upward, set len = length(subpage part to shift) */
+  if (IS_LEAF2(sp)) {
+    len = nsize;
+    if (nsize & 1)
+      return; /* do not make the node uneven-sized */
+  } else {
+    xp = (MDB_page *)((char *)sp + delta); /* destination subpage */
+    for (i = NUMKEYS(sp); --i >= 0;)
+      xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
+    len = PAGEHDRSZ;
+  }
+  sp->mp_upper = sp->mp_lower; /* the sub-page has no free space left */
+  COPY_PGNO(sp->mp_pgno, mp->mp_pgno);
+  SETDSZ(node, nsize);
+
+  /* Shift upward */
+  base = (char *)mp + mp->mp_upper + PAGEBASE;
+  memmove(base + delta, base, (char *)sp + len - base);
+
+  ptr = mp->mp_ptrs[indx];
+  for (i = NUMKEYS(mp); --i >= 0;) {
+    if (mp->mp_ptrs[i] <= ptr)
+      mp->mp_ptrs[i] += delta; /* nodes at or below the shrunk one moved up by delta */
+  }
+  mp->mp_upper += delta;
+}
+
+/** Initial setup of a sorted-dups cursor.
+ * Sorted duplicates are implemented as a sub-database for the given key.
+ * The duplicate data items are actually keys of the sub-database.
+ * Operations on the duplicate data items are performed using a sub-cursor
+ * initialized when the sub-database is first accessed. This function does
+ * the preliminary setup of the sub-cursor, filling in the fields that
+ * depend only on the parent DB.
+ * @param[in] mc The main cursor whose sorted-dups cursor is to be
+ * initialized.
+ */
+static void mdbx_xcursor_init0(MDB_cursor *mc) {
+  MDB_xcursor *mx = mc->mc_xcursor;
+
+  mx->mx_cursor.mc_xcursor = NULL; /* a sub-cursor has no sub-cursor of its own */
+  mx->mx_cursor.mc_txn = mc->mc_txn;
+  mx->mx_cursor.mc_db = &mx->mx_db;
+  mx->mx_cursor.mc_dbx = &mx->mx_dbx;
+  mx->mx_cursor.mc_dbi = mc->mc_dbi;
+  mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
+  mx->mx_cursor.mc_snum = 0;
+  mx->mx_cursor.mc_top = 0;
+  mx->mx_cursor.mc_flags = C_SUB;
+  mx->mx_dbx.md_name.mv_size = 0;
+  mx->mx_dbx.md_name.mv_data = NULL;
+  mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; /* parent's data comparator orders the sub-DB keys */
+  mx->mx_dbx.md_dcmp = NULL;
+  mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
+}
+
+/** Final setup of a sorted-dups cursor.
+ * Sets up the fields that depend on the data from the main cursor.
+ * For an #F_SUBDATA node the #MDB_db record is copied out of the node data
+ * and the sub-cursor is left unpositioned (flags = C_SUB only). Otherwise
+ * the node data is treated as a sub-page: a one-level database record is
+ * synthesized for it and the sub-cursor is placed on its first entry
+ * (C_INITIALIZED, mc_ki[0] = 0).
+ * @param[in] mc The main cursor whose sorted-dups cursor is to be
+ * initialized.
+ * @param[in] node The data containing the #MDB_db record for the
+ * sorted-dup database.
+ */
+static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node) {
+  MDB_xcursor *mx = mc->mc_xcursor;
+
+  if (node->mn_flags & F_SUBDATA) {
+    memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
+    mx->mx_cursor.mc_pg[0] = 0;
+    mx->mx_cursor.mc_snum = 0;
+    mx->mx_cursor.mc_top = 0;
+    mx->mx_cursor.mc_flags = C_SUB;
+  } else {
+    MDB_page *fp = NODEDATA(node);
+    mx->mx_db.md_xsize = 0;
+    mx->mx_db.md_flags = 0;
+    mx->mx_db.md_depth = 1;
+    mx->mx_db.md_branch_pages = 0;
+    mx->mx_db.md_leaf_pages = 1;
+    mx->mx_db.md_overflow_pages = 0;
+    mx->mx_db.md_entries = NUMKEYS(fp);
+    COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
+    mx->mx_cursor.mc_snum = 1;
+    mx->mx_cursor.mc_top = 0;
+    mx->mx_cursor.mc_flags = C_INITIALIZED | C_SUB;
+    mx->mx_cursor.mc_pg[0] = fp;
+    mx->mx_cursor.mc_ki[0] = 0;
+    if (mc->mc_db->md_flags & MDB_DUPFIXED) {
+      mx->mx_db.md_flags = MDB_DUPFIXED;
+      mx->mx_db.md_xsize = fp->mp_leaf2_ksize;
+      if (mc->mc_db->md_flags & MDB_INTEGERDUP)
+        mx->mx_db.md_flags |= MDB_INTEGERKEY;
+    }
+  }
+  mdbx_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi,
+             mx->mx_db.md_root);
+  mx->mx_dbflag = DB_VALID | DB_USRVALID | DB_DUPDATA;
+  /* #if UINT_MAX < SIZE_MAX
+          if (mx->mx_dbx.md_cmp == mdbx_cmp_int && mx->mx_db.md_pad ==
+     sizeof(size_t))
+                  mx->mx_dbx.md_cmp = mdbx_cmp_clong;
+  #endif */
+}
+
+/** Fixup a sorted-dups cursor due to underlying update.
+ * Sets up some fields that depend on the data from the main cursor.
+ * Almost the same as init1, but skips initialization steps if the
+ * xcursor had already been used.
+ * When \b new_dupdata is zero and the xcursor is not C_INITIALIZED, nothing
+ * is changed.
+ * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up.
+ * @param[in] src_mx The xcursor of an up-to-date cursor.
+ * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item.
+ */
+static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx,
+                               int new_dupdata) {
+  MDB_xcursor *mx = mc->mc_xcursor;
+
+  if (new_dupdata) {
+    mx->mx_cursor.mc_snum = 1;
+    mx->mx_cursor.mc_top = 0;
+    mx->mx_cursor.mc_flags |= C_INITIALIZED;
+    mx->mx_cursor.mc_ki[0] = 0;
+    mx->mx_dbflag = DB_VALID | DB_USRVALID | DB_DUPDATA;
+    mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp;
+  } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) {
+    return;
+  }
+  mx->mx_db = src_mx->mx_db; /* take the up-to-date sub-DB record ... */
+  mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; /* ... and its root page pointer */
+  mdbx_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi,
+             mx->mx_db.md_root);
+}
+
+/** Initialize a cursor for a given transaction and database.
*/ +static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, + MDB_xcursor *mx) { + mc->mc_signature = MDBX_MC_SIGNATURE; + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_flags = 0; + mc->mc_ki[0] = 0; + mc->mc_xcursor = NULL; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + mdbx_tassert(txn, mx != NULL); + mx->mx_cursor.mc_signature = MDBX_MC_SIGNATURE; + mc->mc_xcursor = mx; + mdbx_xcursor_init0(mc); + } + if (unlikely(*mc->mc_dbflag & DB_STALE)) { + mdbx_page_search(mc, NULL, MDB_PS_ROOTONLY); + } +} + +int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); + + if (unlikely(!ret || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) + return EINVAL; + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + return EINVAL; + + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); + + if (likely((mc = malloc(size)) != NULL)) { + mdbx_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; + } + + *ret = mc; + + return MDB_SUCCESS; +} + +int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { + if (unlikely(!mc || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE && + mc->mc_signature != MDBX_MC_READY4CLOSE)) + return EINVAL; + + if (unlikely(!TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))) + 
return EINVAL; + + if (unlikely(mc->mc_backup)) + return EINVAL; + + if (unlikely((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)) { +#if MDBX_MODE_ENABLED + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + mc->mc_signature = MDBX_MC_READY4CLOSE; +#else + return EINVAL; +#endif + } + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + mdbx_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; +} + +/* Return the count of duplicate data items for the current key */ +int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { + if (unlikely(mc == NULL || countp == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return EINVAL; + +#if MDBX_MODE_ENABLED + if (!mc->mc_snum) { + *countp = 0; + return MDB_NOTFOUND; + } + + MDB_page *mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) { + *countp = 0; + return MDB_NOTFOUND; + } + + *countp = 1; + if (mc->mc_xcursor != NULL) { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & + C_INITIALIZED)); + *countp = mc->mc_xcursor->mx_db.md_entries; + } + } +#else + if (unlikely(mc->mc_xcursor == NULL)) + return MDB_INCOMPATIBLE; + + if (!mc->mc_snum) + return MDB_NOTFOUND; + + MDB_page *mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) + return MDB_NOTFOUND; + + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) + return EINVAL; + *countp = 
mc->mc_xcursor->mx_db.md_entries; + } +#endif /* MDBX_MODE_ENABLED */ + return MDB_SUCCESS; +} + +void mdbx_cursor_close(MDB_cursor *mc) { + if (mc) { + mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_SIGNATURE || + mc->mc_signature == MDBX_MC_READY4CLOSE); + if (!mc->mc_backup) { + /* Remove from txn, if tracked. + * A read-only txn (!C_UNTRACK) may have been freed already, + * so do not peek inside it. Only write txns track cursors. */ + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + mc->mc_signature = 0; + free(mc); + } else { + /* cursor closed before nested txn ends */ + mdbx_cassert(mc, mc->mc_signature == MDBX_MC_SIGNATURE); + mc->mc_signature = MDBX_MC_WAIT4EOT; + } + } +} + +MDB_txn *mdbx_cursor_txn(MDB_cursor *mc) { + if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) + return NULL; + return mc->mc_txn; +} + +MDB_dbi mdbx_cursor_dbi(MDB_cursor *mc) { + if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) + return INT_MIN; + return mc->mc_dbi; +} + +/** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc Cursor pointing to the node to operate on. + * @param[in] key The new key to use. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; + + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %zu", indx, ptr, + mdbx_dkey(&k2, kbuf2), DKEY(key), mp->mp_pgno); + } + + /* Sizes must be 2-byte aligned. */ + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; + + /* Shift node contents if EVEN(key length) changed. */ + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + /* not enough space left, do a delete and split */ + mdbx_debug("Not enough room, delta = %d, splitting...", delta); + pgno = NODEPGNO(node); + mdbx_node_del(mc, 0); + return mdbx_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); + } + + /* But even if no shift was needed, update ksize */ + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; + + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + return MDB_SUCCESS; +} + +static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); + +/** Perform \b act while tracking temporary cursor \b mn */ +#define WITH_CURSOR_TRACKING(mn, act) \ + do { \ + MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + if ((mn).mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + tracked = &dummy; \ + } 
else { \ + tracked = &(mn); \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ + } while (0) + +/** Move a node from csrc to cdst. + */ +static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; + + DKBUF; + + /* Mark src and dst as dirty. */ + if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + return rc; + + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_xsize; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], + key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdbx_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && + IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned snum = csrc->mc_snum; + MDB_node *s2; + /* must find the lowest key below src */ + rc = mdbx_page_search_lowest(csrc); + if (unlikely(rc)) + return rc; + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_xsize; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; + csrc->mc_top = snum; + } else { + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + } + mn.mc_xcursor = NULL; + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + /* must find the lowest key below dst */ + mdbx_cursor_copy(cdst, &mn); + rc = mdbx_page_search_lowest(&mn); + if (unlikely(rc)) + return rc; + if 
(IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_xsize; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + mn.mc_snum = snum--; + mn.mc_top = snum; + mn.mc_ki[snum] = 0; + rc = mdbx_update_key(&mn, &bkey); + if (unlikely(rc)) + return rc; + } + + mdbx_debug("moving %s node %u [%s] on page %zu to node %u on page %zu", + IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, cdst->mc_ki[cdst->mc_top], + cdst->mc_pg[cdst->mc_top]->mp_pgno); + + /* Add the node to the destination page. */ + rc = + mdbx_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + /* Delete the node from the source page. */ + mdbx_node_del(csrc, key.mv_size); + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mpd, *mps; + + mps = csrc->mc_pg[csrc->mc_top]; + /* If we're adding on the left, bump others up */ + if (fromleft) { + mpd = cdst->mc_pg[csrc->mc_top]; + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3 != cdst && m3->mc_pg[csrc->mc_top] == mpd && + m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { + m3->mc_ki[csrc->mc_top]++; + } + if (m3 != csrc && m3->mc_pg[csrc->mc_top] == mps && + m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top - 1]++; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + } + } else + /* Adding on the right, bump others 
down */ + { + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) + continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3->mc_pg[csrc->mc_top] == mps) { + if (!m3->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top - 1]--; + } else { + m3->mc_ki[csrc->mc_top]--; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], + m3->mc_ki[csrc->mc_top]); + } + } + } + } + + /* Update the parent separators. */ + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top - 1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + mdbx_debug("update separator for source page %zu to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); + mdbx_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdbx_update_key(csrc, &nullkey); + csrc->mc_ki[csrc->mc_top] = ix; + mdbx_cassert(csrc, rc == MDB_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top - 1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); 
+ } + mdbx_debug("update separator for destination page %zu to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); + mdbx_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdbx_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; + mdbx_cassert(cdst, rc == MDB_SUCCESS); + } + } + + return MDB_SUCCESS; +} + +/** Merge one page into another. + * The nodes from the page pointed to by \b csrc will + * be copied to the page pointed to by \b cdst and then + * the \b csrc page will be freed. + * @param[in] csrc Cursor pointing to the source page. + * @param[in] cdst Cursor pointing to the destination page. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { + MDB_page *psrc, *pdst; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + int rc; + indx_t i, j; + + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + mdbx_debug("merging page %zu into %zu", psrc->mp_pgno, pdst->mp_pgno); + + mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdbx_cassert(csrc, cdst->mc_snum > 1); + + /* Mark dst as dirty. */ + if (unlikely(rc = mdbx_page_touch(cdst))) + return rc; + + /* get dst page again now that we've touched it. */ + pdst = cdst->mc_pg[cdst->mc_top]; + + /* Move all nodes from src to dst. 
+ */ + j = nkeys = NUMKEYS(pdst); + if (IS_LEAF2(psrc)) { + key.mv_size = csrc->mc_db->md_xsize; + key.mv_data = PAGEDATA(psrc); + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + rc = mdbx_node_add(cdst, j, &key, NULL, 0, 0); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; + } + } else { + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + srcnode = NODEPTR(psrc, i); + if (i == 0 && IS_BRANCH(psrc)) { + MDB_cursor mn; + MDB_node *s2; + mdbx_cursor_copy(csrc, &mn); + mn.mc_xcursor = NULL; + /* must find the lowest key below src */ + rc = mdbx_page_search_lowest(&mn); + if (unlikely(rc)) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_xsize; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdbx_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), + srcnode->mn_flags); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } + + mdbx_debug("dst page %zu now has %u keys (%.1f%% filled)", pdst->mp_pgno, + NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); + + /* Unlink the src page from parent and add to free list. + */ + csrc->mc_top--; + mdbx_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdbx_update_key(csrc, &key); + if (unlikely(rc)) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + psrc = csrc->mc_pg[csrc->mc_top]; + /* If not operating on FreeDB, allow this page to be reused + * in this txn. Otherwise just add to free list. 
+ */ + rc = mdbx_page_loose(csrc, psrc); + if (unlikely(rc)) + return rc; + if (IS_LEAF(psrc)) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + unsigned top = csrc->mc_top; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) + continue; + if (m3->mc_snum < csrc->mc_snum) + continue; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + m3->mc_ki[top] += nkeys; + m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; + } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && + m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) { + m3->mc_ki[top - 1]--; + } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); + } + } + { + unsigned snum = cdst->mc_snum; + uint16_t depth = cdst->mc_db->md_depth; + mdbx_cursor_pop(cdst); + rc = mdbx_rebalance(cdst); + /* Did the tree height change? */ + if (depth != cdst->mc_db->md_depth) + snum += cdst->mc_db->md_depth - depth; + cdst->mc_snum = snum; + cdst->mc_top = snum - 1; + } + return rc; +} + +/** Copy the contents of a cursor. + * @param[in] csrc The cursor to copy from. + * @param[out] cdst The cursor to copy to. + */ +static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) { + unsigned i; + + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + for (i = 0; i < csrc->mc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } +} + +/** Rebalance the tree after a delete operation. + * @param[in] mc Cursor pointing to the page where rebalancing + * should begin. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_rebalance(MDB_cursor *mc) { + MDB_node *node; + int rc, fromleft; + unsigned ptop, minkeys, thresh; + MDB_cursor mn; + indx_t oldki; + + if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { + minkeys = 2; + thresh = 1; + } else { + minkeys = 1; + thresh = FILL_THRESHOLD; + } + mdbx_debug("rebalancing %s page %zu (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdbx_dbg_pgno(mc->mc_pg[mc->mc_top]), + NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10); + + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + mdbx_debug("no need to rebalance page %zu, above fill threshold", + mdbx_dbg_pgno(mc->mc_pg[mc->mc_top])); + return MDB_SUCCESS; + } + + if (mc->mc_snum < 2) { + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + mdbx_debug("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } + if (NUMKEYS(mp) == 0) { + mdbx_debug("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdbx_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (unlikely(rc)) + return rc; + /* Adjust cursors pointing to mp */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + int i; + mdbx_debug("collapsing root page!"); + rc = mdbx_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (unlikely(rc)) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + rc = mdbx_page_get(mc, 
mc->mc_db->md_root, &mc->mc_pg[0], NULL); + if (unlikely(rc)) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; + for (i = 1; i < mc->mc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i + 1]; + mc->mc_ki[i] = mc->mc_ki[i + 1]; + } + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_pg[0] == mp) { + for (i = 0; i < mc->mc_db->md_depth; i++) { + m3->mc_pg[i] = m3->mc_pg[i + 1]; + m3->mc_ki[i] = m3->mc_ki[i + 1]; + } + m3->mc_snum--; + m3->mc_top--; + } + } + } + } else + mdbx_debug("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. + */ + ptop = mc->mc_top - 1; + mdbx_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. + */ + + /* Find neighbors. + */ + mdbx_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + oldki = mc->mc_ki[mc->mc_top]; + if (mc->mc_ki[ptop] == 0) { + /* We're the leftmost leaf in our parent. + */ + mdbx_debug("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (unlikely(rc)) + return rc; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + fromleft = 0; + } else { + /* There is at least one neighbor to the left. 
+ */ + mdbx_debug("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (unlikely(rc)) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mc->mc_ki[mc->mc_top] = 0; + fromleft = 1; + } + + mdbx_debug("found neighbor page %zu (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); + + /* If the neighbor page is above threshold and has enough keys, + * move one key from it. Otherwise we should try to merge them. + * (A branch page must never have less than 2 keys.) + */ + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && + NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdbx_node_move(&mn, mc, fromleft); + if (fromleft) { + /* if we inserted on left, bump position up */ + oldki++; + } + } else { + if (!fromleft) { + rc = mdbx_page_merge(&mn, mc); + } else { + oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + mdbx_cursor_copy(&mn, mc); + } + mc->mc_flags &= ~C_EOF; + } + mc->mc_ki[mc->mc_top] = oldki; + return rc; +} + +/** Complete a delete operation started by #mdbx_cursor_del(). */ +static int mdbx_cursor_del0(MDB_cursor *mc) { + int rc; + MDB_page *mp; + indx_t ki; + unsigned nkeys; + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdbx_node_del(mc, mc->mc_db->md_xsize); + mc->mc_db->md_entries--; + { + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + } + rc = mdbx_rebalance(mc); + + if (likely(rc == MDB_SUCCESS)) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by mdbx_rebalance and aren't needed here. + */ + if (!mc->mc_snum) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2 = m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { + MDB_node *node = + NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node is a fake page, it needs to be reinited + * because its data has moved. But just reset mc_pg[0] + * if the xcursor is already live. 
+ */ + if ((node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + else + mdbx_xcursor_init1(m3, node); + } + } + } + } + } + mc->mc_flags |= C_DEL; + } + + if (unlikely(rc)) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { + if (unlikely(!key || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + +#if !MDBX_MODE_ENABLED + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + /* must ignore any data */ + data = NULL; + } +#endif + + return mdbx_del0(txn, dbi, key, data, 0); +} + +static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags) { + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata; + int rc, exact = 0; + DKBUF; + + mdbx_debug("====> delete db %u key [%s]", dbi, DKEY(key)); + + mdbx_cursor_init(&mc, txn, dbi, &mx); + + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + data = &rdata; + } else { + op = MDB_SET; + flags |= MDB_NODUPDATA; + } + rc = mdbx_cursor_set(&mc, key, data, op, &exact); + if (likely(rc == 0)) { + /* let mdbx_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. 
     */
    mc.mc_next = txn->mt_cursors[dbi];
    txn->mt_cursors[dbi] = &mc;
    rc = mdbx_cursor_del(&mc, flags);
    txn->mt_cursors[dbi] = mc.mc_next;
  }
  return rc;
}

/** Split a page and insert a new node.
 * Set #MDB_TXN_ERROR on failure.
 *
 * A new right sibling page is always allocated. When the cursor has no
 * parent level (mc_top < 1) a new branch root is allocated as well and the
 * cursor stack is shifted down by one slot. The separator key is then added
 * to the parent, recursing into mdbx_page_split() when the parent is full.
 * Finally every other initialized cursor tracked in mt_cursors[] for this
 * DBI is adjusted to the new layout.
 *
 * @param[in,out] mc Cursor pointing to the page and desired insertion index.
 * The cursor will be updated to point to the actual page and index where
 * the node got inserted after the split.
 * @param[in] newkey The key for the newly inserted node.
 * @param[in] newdata The data for the newly inserted node.
 * @param[in] newpgno The page number, if the new node is a branch node.
 * @param[in] nflags The #NODE_ADD_FLAGS for the new node.
 * @return 0 on success, non-zero on failure.
 */
static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
                           pgno_t newpgno, unsigned nflags) {
  unsigned flags;
  int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
  indx_t newindx;
  pgno_t pgno = 0;
  int i, j, split_indx, nkeys, pmax;
  MDB_env *env = mc->mc_txn->mt_env;
  MDB_node *node;
  MDB_val sepkey, rkey, xdata, *rdata = &xdata;
  MDB_page *copy = NULL;
  MDB_page *mp, *rp, *pp;
  int ptop;
  MDB_cursor mn;
  DKBUF;

  mp = mc->mc_pg[mc->mc_top];
  newindx = mc->mc_ki[mc->mc_top];
  nkeys = NUMKEYS(mp);

  mdbx_debug("-----> splitting %s page %zu and adding [%s] at index %i/%i",
             IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey),
             mc->mc_ki[mc->mc_top], nkeys);

  /* Create a right sibling. Nothing else has been allocated yet, so a
   * failure here returns directly instead of going through `done`. */
  if ((rc = mdbx_page_new(mc, mp->mp_flags, 1, &rp)))
    return rc;
  rp->mp_leaf2_ksize = mp->mp_leaf2_ksize;
  mdbx_debug("new right sibling: page %zu", rp->mp_pgno);

  /* Usually when splitting the root page, the cursor
   * height is 1. But when called from mdbx_update_key,
   * the cursor height may be greater because it walks
   * up the stack while finding the branch slot to update.
   */
  if (mc->mc_top < 1) {
    if ((rc = mdbx_page_new(mc, P_BRANCH, 1, &pp)))
      goto done;
    /* shift current top to make room for new parent */
    for (i = mc->mc_snum; i > 0; i--) {
      mc->mc_pg[i] = mc->mc_pg[i - 1];
      mc->mc_ki[i] = mc->mc_ki[i - 1];
    }
    mc->mc_pg[0] = pp;
    mc->mc_ki[0] = 0;
    mc->mc_db->md_root = pp->mp_pgno;
    mdbx_debug("root split! new root = %zu", pp->mp_pgno);
    /* new_root keeps the depth before the increment; it is used below both
     * as a "root was split" flag and as the top index when shifting the
     * stacks of other cursors. */
    new_root = mc->mc_db->md_depth++;

    /* Add left (implicit) pointer. */
    if (unlikely((rc = mdbx_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) !=
                 MDB_SUCCESS)) {
      /* undo the pre-push */
      mc->mc_pg[0] = mc->mc_pg[1];
      mc->mc_ki[0] = mc->mc_ki[1];
      mc->mc_db->md_root = mp->mp_pgno;
      mc->mc_db->md_depth--;
      goto done;
    }
    mc->mc_snum++;
    mc->mc_top++;
    ptop = 0;
  } else {
    ptop = mc->mc_top - 1;
    mdbx_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno);
  }

  /* mn is a scratch cursor: same stack as mc, but its top page is the new
   * right sibling and its parent index is one slot past mc's. */
  mdbx_cursor_copy(mc, &mn);
  mn.mc_xcursor = NULL;
  mn.mc_pg[mn.mc_top] = rp;
  mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1;

  if (nflags & MDB_APPEND) {
    /* Append: no existing node is moved; the new key itself is the
     * separator and nkeys is zeroed for the cursor fix-up below. */
    mn.mc_ki[mn.mc_top] = 0;
    sepkey = *newkey;
    split_indx = newindx;
    nkeys = 0;
  } else {
    split_indx = (nkeys + 1) / 2;

    if (IS_LEAF2(rp)) {
      /* Fixed-size keys of md_xsize bytes addressed with LEAF2KEY(): the
       * upper half is moved with memcpy and the new key is spliced in
       * directly, so no temporary page copy is needed. */
      char *split, *ins;
      int x;
      unsigned lsize, rsize, ksize;
      /* Move half of the keys to the right sibling */
      x = mc->mc_ki[mc->mc_top] - split_indx;
      ksize = mc->mc_db->md_xsize;
      split = LEAF2KEY(mp, split_indx, ksize);
      rsize = (nkeys - split_indx) * ksize;
      lsize = (nkeys - split_indx) * sizeof(indx_t);
      mp->mp_lower -= lsize;
      rp->mp_lower += lsize;
      mp->mp_upper += rsize - lsize;
      rp->mp_upper -= rsize - lsize;
      sepkey.mv_size = ksize;
      if (newindx == split_indx) {
        sepkey.mv_data = newkey->mv_data;
      } else {
        sepkey.mv_data = split;
      }
      if (x < 0) {
        /* New key belongs on the left page. */
        ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
        memcpy(rp->mp_ptrs, split, rsize);
        sepkey.mv_data = rp->mp_ptrs;
        memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
        memcpy(ins, newkey->mv_data, ksize);
        mp->mp_lower += sizeof(indx_t);
        mp->mp_upper -= ksize - sizeof(indx_t);
      } else {
        /* New key belongs on the right page, at index x. */
        if (x)
          memcpy(rp->mp_ptrs, split, x * ksize);
        ins = LEAF2KEY(rp, x, ksize);
        memcpy(ins, newkey->mv_data, ksize);
        memcpy(ins + ksize, split + x * ksize, rsize - x * ksize);
        rp->mp_lower += sizeof(indx_t);
        rp->mp_upper -= ksize - sizeof(indx_t);
        mc->mc_ki[mc->mc_top] = x;
      }
    } else {
      int psize, nsize, k;
      /* Maximum free space in an empty page */
      pmax = env->me_psize - PAGEHDRSZ;
      if (IS_LEAF(mp))
        nsize = mdbx_leaf_size(env, newkey, newdata);
      else
        nsize = mdbx_branch_size(env, newkey);
      nsize = EVEN(nsize);

      /* grab a page to hold a temporary copy */
      copy = mdbx_page_malloc(mc->mc_txn, 1);
      if (unlikely(copy == NULL)) {
        rc = ENOMEM;
        goto done;
      }
      copy->mp_pgno = mp->mp_pgno;
      copy->mp_flags = mp->mp_flags;
      copy->mp_lower = (PAGEHDRSZ - PAGEBASE);
      copy->mp_upper = env->me_psize - PAGEBASE;

      /* prepare to insert: copy->mp_ptrs becomes mp's offset table with a
       * zero placeholder in the slot of the new node */
      for (i = 0, j = 0; i < nkeys; i++) {
        if (i == newindx) {
          copy->mp_ptrs[j++] = 0;
        }
        copy->mp_ptrs[j++] = mp->mp_ptrs[i];
      }

      /* When items are relatively large the split point needs
       * to be checked, because being off-by-one will make the
       * difference between success or failure in mdbx_node_add.
       *
       * It's also relevant if a page happens to be laid out
       * such that one half of its nodes are all "small" and
       * the other half of its nodes are "large." If the new
       * item is also "large" and falls on the half with
       * "large" nodes, it also may not fit.
       *
       * As a final tweak, if the new item goes on the last
       * spot on the page (and thus, onto the new page), bias
       * the split so the new page is emptier than the old page.
       * This yields better packing during sequential inserts.
       */
      if (nkeys < 20 || nsize > pmax / 16 || newindx >= nkeys) {
        /* Find split point: i walks with step j (+1 or -1) towards k while
         * psize accumulates the bytes of the nodes passed so far. */
        psize = 0;
        if (newindx <= split_indx || newindx >= nkeys) {
          i = 0;
          j = 1;
          k = newindx >= nkeys ? nkeys : split_indx + 1 + IS_LEAF(mp);
        } else {
          i = nkeys;
          j = -1;
          k = split_indx - 1;
        }
        for (; i != k; i += j) {
          if (i == newindx) {
            psize += nsize;
            node = NULL;
          } else {
            node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
            psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
            if (IS_LEAF(mp)) {
              if (F_ISSET(node->mn_flags, F_BIGDATA))
                psize += sizeof(pgno_t);
              else
                psize += NODEDSZ(node);
            }
            psize = EVEN(psize);
          }
          if (psize > pmax || i == k - j) {
            split_indx = i + (j < 0);
            break;
          }
        }
      }
      if (split_indx == newindx) {
        sepkey.mv_size = newkey->mv_size;
        sepkey.mv_data = newkey->mv_data;
      } else {
        node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
        sepkey.mv_size = node->mn_ksize;
        sepkey.mv_data = NODEKEY(node);
      }
    }
  }

  mdbx_debug("separator is %d [%s]", split_indx, DKEY(&sepkey));

  /* Copy separator key to the parent. */
  if (SIZELEFT(mn.mc_pg[ptop]) < mdbx_branch_size(env, &sepkey)) {
    /* Parent is full: split it first, one level up, through mn. */
    int snum = mc->mc_snum;
    mn.mc_snum--;
    mn.mc_top--;
    did_split = 1;
    /* We want other splits to find mn when doing fixups */
    WITH_CURSOR_TRACKING(
        mn, rc = mdbx_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0));
    if (unlikely(rc != MDB_SUCCESS))
      goto done;

    /* root split? */
    if (mc->mc_snum > snum) {
      ptop++;
    }
    /* Right page might now have changed parent.
     * Check if left page also changed parent.
     */
    if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
        mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
      for (i = 0; i < ptop; i++) {
        mc->mc_pg[i] = mn.mc_pg[i];
        mc->mc_ki[i] = mn.mc_ki[i];
      }
      mc->mc_pg[ptop] = mn.mc_pg[ptop];
      if (mn.mc_ki[ptop]) {
        mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
      } else {
        /* find right page's left sibling */
        mc->mc_ki[ptop] = mn.mc_ki[ptop];
        rc = mdbx_cursor_sibling(mc, 0);
      }
    }
  } else {
    /* Parent has room: add the separator there, pointing at rp. */
    mn.mc_top--;
    rc = mdbx_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
    mn.mc_top++;
  }
  if (unlikely(rc != MDB_SUCCESS)) {
    if (rc == MDB_NOTFOUND) /* improper mdbx_cursor_sibling() result */
      rc = MDB_PROBLEM;
    goto done;
  }
  if (nflags & MDB_APPEND) {
    mc->mc_pg[mc->mc_top] = rp;
    mc->mc_ki[mc->mc_top] = 0;
    rc = mdbx_node_add(mc, 0, newkey, newdata, newpgno, nflags);
    if (rc)
      goto done;
    for (i = 0; i < mc->mc_top; i++)
      mc->mc_ki[i] = mn.mc_ki[i];
  } else if (!IS_LEAF2(mp)) {
    /* Move nodes: starting at split_indx, nodes are re-added to rp; once i
     * reaches nkeys the loop wraps to index 0 and re-adds the remaining
     * nodes into the temporary `copy`, which then replaces mp's contents. */
    mc->mc_pg[mc->mc_top] = rp;
    i = split_indx;
    j = 0;
    do {
      if (i == newindx) {
        rkey.mv_data = newkey->mv_data;
        rkey.mv_size = newkey->mv_size;
        if (IS_LEAF(mp)) {
          rdata = newdata;
        } else
          pgno = newpgno;
        flags = nflags;
        /* Update index for the new key. */
        mc->mc_ki[mc->mc_top] = j;
      } else {
        node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
        rkey.mv_data = NODEKEY(node);
        rkey.mv_size = node->mn_ksize;
        if (IS_LEAF(mp)) {
          xdata.mv_data = NODEDATA(node);
          xdata.mv_size = NODEDSZ(node);
          rdata = &xdata;
        } else
          pgno = NODEPGNO(node);
        flags = node->mn_flags;
      }

      if (!IS_LEAF(mp) && j == 0) {
        /* First branch index doesn't need key data. */
        rkey.mv_size = 0;
      }

      rc = mdbx_node_add(mc, j, &rkey, rdata, pgno, flags);
      if (rc)
        goto done;
      if (i == nkeys) {
        i = 0;
        j = 0;
        mc->mc_pg[mc->mc_top] = copy;
      } else {
        i++;
        j++;
      }
    } while (i != split_indx);

    /* Install the rebuilt left half from `copy` into mp. */
    nkeys = NUMKEYS(copy);
    for (i = 0; i < nkeys; i++)
      mp->mp_ptrs[i] = copy->mp_ptrs[i];
    mp->mp_lower = copy->mp_lower;
    mp->mp_upper = copy->mp_upper;
    memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1),
           env->me_psize - copy->mp_upper - PAGEBASE);

    /* reset back to original page */
    if (newindx < split_indx) {
      mc->mc_pg[mc->mc_top] = mp;
    } else {
      mc->mc_pg[mc->mc_top] = rp;
      mc->mc_ki[ptop]++;
      /* Make sure mc_ki is still valid.
       */
      if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
          mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
        for (i = 0; i <= ptop; i++) {
          mc->mc_pg[i] = mn.mc_pg[i];
          mc->mc_ki[i] = mn.mc_ki[i];
        }
      }
    }
    if (nflags & MDB_RESERVE) {
      /* Hand the caller the address of the reserved data area. */
      node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
      if (!(node->mn_flags & F_BIGDATA))
        newdata->mv_data = NODEDATA(node);
    }
  } else {
    if (newindx >= split_indx) {
      mc->mc_pg[mc->mc_top] = rp;
      mc->mc_ki[ptop]++;
      /* Make sure mc_ki is still valid.
       */
      if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
          mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
        for (i = 0; i <= ptop; i++) {
          mc->mc_pg[i] = mn.mc_pg[i];
          mc->mc_ki[i] = mn.mc_ki[i];
        }
      }
    }
  }

  {
    /* Adjust other cursors pointing to mp */
    MDB_cursor *m2, *m3;
    MDB_dbi dbi = mc->mc_dbi;
    nkeys = NUMKEYS(mp);

    for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
      /* When mc is a sub-cursor, compare against the tracked cursor's
       * embedded sub-cursor instead of the tracked cursor itself. */
      if (mc->mc_flags & C_SUB)
        m3 = &m2->mc_xcursor->mx_cursor;
      else
        m3 = m2;
      if (m3 == mc)
        continue;
      if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
        continue;
      if (new_root) {
        int k;
        /* sub cursors may be on different DB */
        if (m3->mc_pg[0] != mp)
          continue;
        /* root split */
        for (k = new_root; k >= 0; k--) {
          m3->mc_ki[k + 1] = m3->mc_ki[k];
          m3->mc_pg[k + 1] = m3->mc_pg[k];
        }
        if (m3->mc_ki[0] >= nkeys) {
          m3->mc_ki[0] = 1;
        } else {
          m3->mc_ki[0] = 0;
        }
        m3->mc_pg[0] = mc->mc_pg[0];
        m3->mc_snum++;
        m3->mc_top++;
      }
      if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
        if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
          m3->mc_ki[mc->mc_top]++;
        if (m3->mc_ki[mc->mc_top] >= nkeys) {
          /* The entry this cursor was on moved to the right sibling. */
          m3->mc_pg[mc->mc_top] = rp;
          m3->mc_ki[mc->mc_top] -= nkeys;
          for (i = 0; i < mc->mc_top; i++) {
            m3->mc_ki[i] = mn.mc_ki[i];
            m3->mc_pg[i] = mn.mc_pg[i];
          }
        }
      } else if (!did_split && m3->mc_top >= ptop &&
                 m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
                 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
        m3->mc_ki[ptop]++;
      }
      if (XCURSOR_INITED(m3) && IS_LEAF(mp))
        XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
    }
  }
  mdbx_debug("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp));

done:
  if (copy) /* tmp page */
    mdbx_page_free(env, copy);
  if (unlikely(rc))
    mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
  return rc;
}

/** Store a key/data pair in a database.
 *
 * Works through a stack-allocated cursor that is linked at the head of
 * txn->mt_cursors[dbi] for the duration of the call and unlinked before
 * returning.
 *
 * @param[in] txn Transaction handle; must be writable and not blocked.
 * @param[in] dbi Database handle.
 * @param[in] key Key to store.
 * @param[in,out] data Data to store; passed through to mdbx_cursor_put().
 * @param[in] flags Any of MDB_NOOVERWRITE, MDB_NODUPDATA, MDB_RESERVE,
 * MDB_APPEND, MDB_APPENDDUP, plus MDB_CURRENT when MDBX_MODE_ENABLED.
 * @return MDB_SUCCESS or the mdbx_cursor_put() result; EINVAL for NULL
 * arguments, an invalid dbi or unsupported flags; MDB_VERSION_MISMATCH for
 * a bad txn signature; EACCES for a read-only txn; MDB_BAD_TXN for a
 * blocked txn; MDBX_EMULTIVAL when MDB_CURRENT targets a key that has
 * duplicates.
 */
int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data,
             unsigned flags) {
  MDB_cursor mc;
  MDB_xcursor mx;

  if (unlikely(!key || !data || !txn))
    return EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDB_VERSION_MISMATCH;

  if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)))
    return EINVAL;

  if (unlikely(flags &
               ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND |
                 MDB_APPENDDUP
                 /* LY: MDB_CURRENT indicates explicit overwrite (update)
                    for MDBX */
                 | (MDBX_MODE_ENABLED ? MDB_CURRENT : 0))))
    return EINVAL;

  if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED)))
    return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;

  mdbx_cursor_init(&mc, txn, dbi, &mx);
  /* Track the temporary cursor so page splits can fix it up. */
  mc.mc_next = txn->mt_cursors[dbi];
  txn->mt_cursors[dbi] = &mc;
  int rc = MDB_SUCCESS;
#if MDBX_MODE_ENABLED
  /* LY: support for update (explicit overwrite) */
  if (flags & MDB_CURRENT) {
    rc = mdbx_cursor_get(&mc, key, NULL, MDB_SET);
    if (likely(rc == MDB_SUCCESS) &&
        (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)) {
      /* LY: allows update (explicit overwrite) only for unique keys */
      MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
      if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
        mdbx_tassert(txn, XCURSOR_INITED(&mc) &&
                              mc.mc_xcursor->mx_db.md_entries > 1);
        rc = MDBX_EMULTIVAL;
      }
    }
  }
#endif /* MDBX_MODE_ENABLED */
  if (likely(rc == MDB_SUCCESS))
    rc = mdbx_cursor_put(&mc, key, data, flags);
  /* Unlink the temporary cursor again on every path. */
  txn->mt_cursors[dbi] = mc.mc_next;

  return rc;
}

/* Size in bytes of each of the two write buffers used by the compacting
 * copy; may be overridden at build time. */
#ifndef MDB_WBUF
#define MDB_WBUF (1024 * 1024)
#endif
#define MDB_EOF 0x10 /**< #mdbx_env_copyfd1() is done reading */

/** State needed for a double-buffering compacting copy. */
typedef struct mdbx_copy {
  MDB_env *mc_env;          /* environment being copied */
  MDB_txn *mc_txn;          /* read txn used for the tree walk */
  pthread_mutex_t mc_mutex; /* guards mc_new */
  pthread_cond_t mc_cond;   /**< Condition variable for #mc_new */
  char *mc_wbuf[2];         /* the two write buffers */
  char *mc_over[2];         /* overflow-page tail to write after a buffer */
  int mc_wlen[2];           /* bytes pending in each write buffer */
  int mc_olen[2];           /* bytes pending in each overflow tail */
  pgno_t mc_next_pgno;      /* next page number to assign in the copy */
  HANDLE mc_fd;             /* destination file descriptor */
  int mc_toggle;            /**< Buffer number in provider */
  int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */
  /** Error code. Never cleared if set. Both threads can set nonzero
   * to fail the copy. Not mutex-protected, LMDB expects atomic int.
   */
  volatile int mc_error;
} mdbx_copy;

/** Dedicated writer thread for compacting copy.
 *
 * Waits on mc_cond until mc_new is non-zero, writes the current buffer
 * (and its overflow tail, if any) to mc_fd, then decrements mc_new and
 * signals the provider. Exits once mc_new equals MDB_EOF alone.
 * The first failure is recorded in mc_error; after that the write loop is
 * skipped but buffers are still drained so the provider is not left
 * waiting. Where SIGPIPE exists it is blocked in this thread and, on
 * EPIPE, the pending signal is consumed with sigwait().
 *
 * @param[in] arg pointer to the shared #mdbx_copy state.
 * @return always NULL.
 */
static void *__cold mdbx_env_copythr(void *arg) {
  mdbx_copy *my = arg;
  char *ptr;
  int toggle = 0, wsize, rc = 0;
  int len;

#ifdef SIGPIPE
  sigset_t set;
  sigemptyset(&set);
  sigaddset(&set, SIGPIPE);
  if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0)
    my->mc_error = rc;
#endif

  pthread_mutex_lock(&my->mc_mutex);
  for (;;) {
    while (!my->mc_new)
      pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
    if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */
      break;
    wsize = my->mc_wlen[toggle];
    ptr = my->mc_wbuf[toggle];
  again:
    rc = MDB_SUCCESS;
    while (wsize > 0 && !my->mc_error) {
      len = write(my->mc_fd, ptr, wsize);
      if (len < 0) {
        rc = errno;
#ifdef SIGPIPE
        if (rc == EPIPE) {
          /* Collect the pending SIGPIPE, otherwise at least OS X
           * gives it to the process on thread-exit (ITS#8504).
           */
          int tmp;
          sigwait(&set, &tmp);
        }
#endif
        break;
      } else if (len > 0) {
        rc = MDB_SUCCESS;
        ptr += len;
        wsize -= len;
        continue;
      } else {
        /* write() made no progress */
        rc = EIO;
        break;
      }
    }
    if (rc) {
      my->mc_error = rc;
    }
    /* If there's an overflow page tail, write it too */
    if (my->mc_olen[toggle]) {
      wsize = my->mc_olen[toggle];
      ptr = my->mc_over[toggle];
      my->mc_olen[toggle] = 0;
      goto again;
    }
    my->mc_wlen[toggle] = 0;
    toggle ^= 1;
    /* Return the empty buffer to provider */
    my->mc_new--;
    pthread_cond_signal(&my->mc_cond);
  }
  pthread_mutex_unlock(&my->mc_mutex);
  return NULL;
}

/** Give buffer and/or #MDB_EOF to writer thread, await unused buffer.
 *
 * @param[in] my control structure.
 * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending).
 */
/* Returns my->mc_error as read after the hand-off, so a failure recorded by
 * the writer thread reaches the caller. The provider's buffer index flips
 * only when the low bit of `adjust` is set. */
static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) {
  pthread_mutex_lock(&my->mc_mutex);
  my->mc_new += adjust;
  pthread_cond_signal(&my->mc_cond);
  while (my->mc_new & 2) /* both buffers in use */
    pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
  pthread_mutex_unlock(&my->mc_mutex);

  my->mc_toggle ^= (adjust & 1);
  /* Both threads reset mc_wlen, to be safe from threading errors */
  my->mc_wlen[my->mc_toggle] = 0;
  return my->mc_error;
}

/** Depth-first tree traversal for compacting copy.
 *
 * Every page reached is appended to the current write buffer under a fresh
 * page number taken from my->mc_next_pgno, and the reference to it in its
 * parent (or, for the root, in *pg) is rewritten to that new number.
 * Overflow pages and nested sub-databases found in leaf nodes are emitted
 * before the leaf that references them.
 *
 * @param[in] my control structure.
 * @param[in,out] pg database root.
 * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB.
 * @return MDB_SUCCESS, ENOMEM, or the first error from the page lookups,
 * the recursive walk or mdbx_env_cthr_toggle().
 */
static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) {
  MDB_cursor mc;
  MDB_node *ni;
  MDB_page *mo, *mp, *leaf;
  char *buf, *ptr;
  int rc, toggle;
  unsigned i;

  /* Empty DB, nothing to do */
  if (*pg == P_INVALID)
    return MDB_SUCCESS;

  memset(&mc, 0, sizeof(mc));
  mc.mc_snum = 1;
  mc.mc_txn = my->mc_txn;

  rc = mdbx_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
  if (rc)
    return rc;
  rc = mdbx_page_search_root(&mc, NULL, MDB_PS_FIRST);
  if (rc)
    return rc;

  /* Make cursor pages writable: one page-sized slot per cursor level */
  buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
  if (buf == NULL)
    return ENOMEM;

  /* Only the levels above the top (i < mc_top) are copied here. */
  for (i = 0; i < mc.mc_top; i++) {
    mdbx_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
    mc.mc_pg[i] = (MDB_page *)ptr;
    ptr += my->mc_env->me_psize;
  }

  /* This is writable space for a leaf page. Usually not needed. */
  leaf = (MDB_page *)ptr;

  toggle = my->mc_toggle;
  while (mc.mc_snum > 0) {
    unsigned n;
    mp = mc.mc_pg[mc.mc_top];
    n = NUMKEYS(mp);

    if (IS_LEAF(mp)) {
      if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
        for (i = 0; i < n; i++) {
          ni = NODEPTR(mp, i);
          if (ni->mn_flags & F_BIGDATA) {
            MDB_page *omp;
            pgno_t pg; /* NOTE: shadows the `pg` parameter in this scope */

            /* Need writable leaf */
            if (mp != leaf) {
              mc.mc_pg[mc.mc_top] = leaf;
              mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
              mp = leaf;
              ni = NODEPTR(mp, i);
            }

            /* Read the old overflow page number, then store the new one in
             * the node. */
            memcpy(&pg, NODEDATA(ni), sizeof(pg));
            memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
            rc = mdbx_page_get(&mc, pg, &omp, NULL);
            if (rc)
              goto done;
            if (my->mc_wlen[toggle] >= MDB_WBUF) {
              rc = mdbx_env_cthr_toggle(my, 1);
              if (rc)
                goto done;
              toggle = my->mc_toggle;
            }
            mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
            memcpy(mo, omp, my->mc_env->me_psize);
            mo->mp_pgno = my->mc_next_pgno;
            my->mc_next_pgno += omp->mp_pages;
            my->mc_wlen[toggle] += my->mc_env->me_psize;
            if (omp->mp_pages > 1) {
              /* The remaining pages are not buffered: they are handed to
               * the writer as an overflow tail pointing at the source. */
              my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
              my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
              rc = mdbx_env_cthr_toggle(my, 1);
              if (rc)
                goto done;
              toggle = my->mc_toggle;
            }
          } else if (ni->mn_flags & F_SUBDATA) {
            MDB_db db;

            /* Need writable leaf */
            if (mp != leaf) {
              mc.mc_pg[mc.mc_top] = leaf;
              mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
              mp = leaf;
              ni = NODEPTR(mp, i);
            }

            /* Walk the nested DB; the recursion updates db.md_root, which
             * is then written back into the node. */
            memcpy(&db, NODEDATA(ni), sizeof(db));
            my->mc_toggle = toggle;
            rc = mdbx_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
            if (rc)
              goto done;
            toggle = my->mc_toggle;
            memcpy(NODEDATA(ni), &db, sizeof(db));
          }
        }
      }
    } else {
      mc.mc_ki[mc.mc_top]++;
      if (mc.mc_ki[mc.mc_top] < n) {
        pgno_t pg; /* NOTE: shadows the `pg` parameter in this scope */
      again:
        ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
        pg = NODEPGNO(ni);
        rc = mdbx_page_get(&mc, pg, &mp, NULL);
        if (rc)
          goto done;
        mc.mc_top++;
        mc.mc_snum++;
        mc.mc_ki[mc.mc_top] = 0;
        if (IS_BRANCH(mp)) {
          /* Whenever we advance to a sibling branch page,
           * we must proceed all the way down to its first leaf.
           */
          mdbx_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
          goto again;
        } else
          mc.mc_pg[mc.mc_top] = mp;
        continue;
      }
    }
    /* Reached for a leaf, or for a branch whose children are all done:
     * emit the page itself. */
    if (my->mc_wlen[toggle] >= MDB_WBUF) {
      rc = mdbx_env_cthr_toggle(my, 1);
      if (rc)
        goto done;
      toggle = my->mc_toggle;
    }
    mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
    mdbx_page_copy(mo, mp, my->mc_env->me_psize);
    mo->mp_pgno = my->mc_next_pgno++;
    my->mc_wlen[toggle] += my->mc_env->me_psize;
    if (mc.mc_top) {
      /* Update parent if there is one */
      ni = NODEPTR(mc.mc_pg[mc.mc_top - 1], mc.mc_ki[mc.mc_top - 1]);
      SETPGNO(ni, mo->mp_pgno);
      mdbx_cursor_pop(&mc);
    } else {
      /* Otherwise we're done */
      *pg = mo->mp_pgno;
      break;
    }
  }
done:
  free(buf);
  return rc;
}

/** Copy environment with compaction.
 *
 * Starts the writer thread (#mdbx_env_copythr), builds both meta pages in
 * the first write buffer, then walks the main DB with #mdbx_env_cwalk().
 * The expected new root page number is computed up front from the free-DB
 * contents; if the walk ends on a different number the copy fails with
 * MDB_INCOMPATIBLE.
 *
 * @param[in] env environment to copy.
 * @param[in] fd destination file descriptor.
 * @return MDB_SUCCESS, or the first error seen by either thread.
 */
static int __cold mdbx_env_copyfd1(MDB_env *env, HANDLE fd) {
  MDB_meta *mm;
  MDB_page *mp;
  mdbx_copy my;
  MDB_txn *txn = NULL;
  pthread_t thr;
  pgno_t root, new_root;
  int rc = MDB_SUCCESS;

  memset(&my, 0, sizeof(my));
  if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
    return rc;
  if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
    goto done2;
  /* One allocation holds both write buffers back to back. */
  my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF * 2);
  if (my.mc_wbuf[0] == NULL) {
    rc = errno;
    goto done;
  }
  memset(my.mc_wbuf[0], 0, MDB_WBUF * 2);
  my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
  my.mc_next_pgno = NUM_METAS;
  my.mc_env = env;
  my.mc_fd = fd;
  rc = pthread_create(&thr, NULL, mdbx_env_copythr, &my);
  if (rc)
    goto done;

  rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn);
  if (rc)
    goto finish;

  /* Meta page 0 */
  mp = (MDB_page *)my.mc_wbuf[0];
  memset(mp, 0, NUM_METAS * env->me_psize);
  mp->mp_pgno = 0;
  mp->mp_flags = P_META;
  mm = (MDB_meta *)PAGEDATA(mp);
  mdbx_env_init_meta0(env, mm);
  mm->mm_address = METAPAGE_1(env)->mm_address;

  /* Meta page 1 starts as a copy of meta page 0; mm now points at it. */
  mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
  mp->mp_pgno = 1;
  mp->mp_flags = P_META;
  *(MDB_meta *)PAGEDATA(mp) = *mm;
  mm = (MDB_meta *)PAGEDATA(mp);

  /* Set metapage 1 with current main DB */
  root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
  if (root != P_INVALID) {
    /* Count free pages + freeDB pages. Subtract from last_pg
     * to find the new last_pg, which also becomes the new root.
     */
    MDB_ID freecount = 0;
    MDB_cursor mc;
    MDB_val key, data;
    mdbx_cursor_init(&mc, txn, FREE_DBI, NULL);
    while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
      freecount += *(MDB_ID *)data.mv_data;
    if (rc != MDB_NOTFOUND)
      goto finish;
    freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
                 txn->mt_dbs[FREE_DBI].md_leaf_pages +
                 txn->mt_dbs[FREE_DBI].md_overflow_pages;

    new_root = txn->mt_next_pgno - 1 - freecount;
    mm->mm_last_pg = new_root;
    mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
    mm->mm_dbs[MAIN_DBI].md_root = new_root;
  } else {
    /* When the DB is empty, handle it specially to
     * fix any breakage like page leaks from ITS#8174.
     */
    mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
  }
  if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
    mm->mm_txnid = 1; /* use metapage 1 */
  }

  my.mc_wlen[0] = env->me_psize * NUM_METAS;
  my.mc_txn = txn;
  rc = mdbx_env_cwalk(&my, &root, 0);
  if (rc == MDB_SUCCESS && root != new_root) {
    rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */
  }

finish:
  /* Publish our error (if any), hand over the last buffer together with
   * MDB_EOF and wait for the writer to exit. */
  if (rc)
    my.mc_error = rc;
  mdbx_env_cthr_toggle(&my, 1 | MDB_EOF);
  rc = pthread_join(thr, NULL);
  mdbx_txn_abort(txn);

done:
  free(my.mc_wbuf[0]);
  pthread_cond_destroy(&my.mc_cond);
done2:
  pthread_mutex_destroy(&my.mc_mutex);
  return rc ? rc : my.mc_error;
}

/** Copy environment as-is.
*/ +static int __cold mdbx_env_copyfd0(MDB_env *env, HANDLE fd) { + MDB_txn *txn = NULL; + pthread_mutex_t *wmutex = NULL; + int rc; + size_t wsize; + char *ptr; + ssize_t len; + size_t w2; + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. + */ + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + /* We must start the actual read txn after blocking writers */ + rc = mdbx_txn_end(txn, MDB_END_RESET_TMP); + if (rc) + return rc; + + /* Temporarily block writers until we snapshot the meta pages */ + wmutex = MDB_MUTEX(env, w); + rc = mdbx_mutex_lock(env, wmutex); + if (unlikely(rc)) + goto leave; + + rc = mdbx_txn_renew0(txn, MDB_RDONLY); + if (rc) { + mdbx_mutex_unlock(env, wmutex); + goto leave; + } + + wsize = env->me_psize * NUM_METAS; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + len = write(fd, ptr, w2); + if (len < 0) { + rc = errno; + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + /* Non-blocking or async handles are not supported */ + rc = EIO; + break; + } + } + mdbx_mutex_unlock(env, wmutex); + + if (rc) + goto leave; + + w2 = txn->mt_next_pgno * env->me_psize; + { + size_t fsize = 0; + if ((rc = mdbx_fsize(env->me_fd, &fsize))) + goto leave; + if (w2 > fsize) + w2 = fsize; + } + wsize = w2 - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + len = write(fd, ptr, w2); + if (len < 0) { + rc = errno; + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdbx_txn_abort(txn); + return rc; +} + +int __cold mdbx_env_copyfd2(MDB_env *env, HANDLE fd, unsigned flags) { + if (flags & MDB_CP_COMPACT) + return mdbx_env_copyfd1(env, fd); + else + return mdbx_env_copyfd0(env, fd); +} + +int __cold mdbx_env_copyfd(MDB_env *env, HANDLE fd) { + return mdbx_env_copyfd2(env, 
fd, 0); +} + +int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { + int rc, len; + char *lpath; + HANDLE newfd = INVALID_HANDLE_VALUE; + + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. + */ + newfd = open(lpath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0666); + if (newfd == INVALID_HANDLE_VALUE) { + rc = errno; + goto leave; + } + + int fdflags; + if ((fdflags = fcntl(newfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(newfd, F_SETFD, fdflags); + + if (env->me_psize >= env->me_os_psize) { +#ifdef F_NOCACHE /* __APPLE__ */ + (void)fcntl(newfd, F_NOCACHE, 1); +#elif defined O_DIRECT + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void)fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif + } + + rc = mdbx_env_copyfd2(env, newfd, flags); + +leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = errno; + + return rc; +} + +int __cold mdbx_env_copy(MDB_env *env, const char *path) { + return mdbx_env_copy2(env, path, 0); +} + +int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { + if (unlikely(flags & ~CHANGEABLE)) + return EINVAL; + + pthread_mutex_t *mutex = MDB_MUTEX(env, w); + int rc = mdbx_mutex_lock(env, mutex); + if (unlikely(rc)) + return rc; + + if (onoff) + env->me_flags |= flags; + else + env->me_flags &= ~flags; + + mdbx_mutex_unlock(env, mutex); + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_flags(MDB_env *env, unsigned *arg) { + if (unlikely(!env || !arg)) + return EINVAL; + + *arg = env->me_flags & (CHANGEABLE | CHANGELESS); + return 
MDB_SUCCESS; +} + +int __cold mdbx_env_set_userctx(MDB_env *env, void *ctx) { + if (unlikely(!env)) + return EINVAL; + env->me_userctx = ctx; + return MDB_SUCCESS; +} + +void *__cold mdbx_env_get_userctx(MDB_env *env) { + return env ? env->me_userctx : NULL; +} + +int __cold mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func) { + if (unlikely(!env)) + return EINVAL; +#if MDB_DEBUG + env->me_assert_func = func; + return MDB_SUCCESS; +#else + (void)func; + return ENOSYS; +#endif +} + +int __cold mdbx_env_get_path(MDB_env *env, const char **arg) { + if (unlikely(!env || !arg)) + return EINVAL; + + *arg = env->me_path; + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_fd(MDB_env *env, int *arg) { + if (unlikely(!env || !arg)) + return EINVAL; + + *arg = env->me_fd; + return MDB_SUCCESS; +} + +/** Common code for #mdbx_stat() and #mdbx_env_stat(). + * @param[in] env the environment to operate in. + * @param[in] db the #MDB_db record containing the stats to return. + * @param[out] arg the address of an #MDB_stat structure to receive the stats. + * @return 0, this function always succeeds. 
 */
static int __cold mdbx_stat0(MDB_env *env, MDB_db *db, MDBX_stat *arg) {
  arg->ms_psize = env->me_psize;
  arg->ms_depth = db->md_depth;
  arg->ms_branch_pages = db->md_branch_pages;
  arg->ms_leaf_pages = db->md_leaf_pages;
  arg->ms_overflow_pages = db->md_overflow_pages;
  arg->ms_entries = db->md_entries;

  return MDB_SUCCESS;
}

/** Return statistics for the main DB, read from the meta page selected by
 * mdbx_meta_head_r().
 * @param[in] env environment handle.
 * @param[out] arg receives the statistics.
 * @param[in] bytes must equal sizeof(MDBX_stat).
 * @return MDB_SUCCESS, or EINVAL for NULL arguments or a size mismatch.
 */
int __cold mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) {
  MDB_meta *meta;

  if (unlikely(env == NULL || arg == NULL))
    return EINVAL;
  if (unlikely(bytes != sizeof(MDBX_stat)))
    return EINVAL;

  meta = mdbx_meta_head_r(env);
  return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
}

/** Return information about the environment.
 * @param[in] env environment handle.
 * @param[out] arg receives the information.
 * @param[in] bytes must equal sizeof(MDBX_envinfo).
 * @return MDB_SUCCESS, or EINVAL for NULL arguments or a size mismatch.
 */
int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) {
  MDB_meta *meta;

  if (unlikely(env == NULL || arg == NULL))
    return EINVAL;

  if (bytes != sizeof(MDBX_envinfo))
    return EINVAL;

  MDB_meta *m1, *m2;
  MDB_reader *r;
  unsigned i;

  m1 = METAPAGE_1(env);
  m2 = METAPAGE_2(env);

  /* Re-read the meta fields until the snapshot agrees with the shared
   * txnid and both datasync signatures are unchanged. */
  do {
    meta = mdbx_meta_head_r(env);
    arg->me_last_txnid = meta->mm_txnid;
    arg->me_last_pgno = meta->mm_last_pg;
    arg->me_meta1_txnid = m1->mm_txnid;
    arg->me_meta1_sign = m1->mm_datasync_sign;
    arg->me_meta2_txnid = m2->mm_txnid;
    arg->me_meta2_sign = m2->mm_datasync_sign;
  } while (unlikely(arg->me_last_txnid != env->me_txns->mti_txnid ||
                    arg->me_meta1_sign != m1->mm_datasync_sign ||
                    arg->me_meta2_sign != m2->mm_datasync_sign));

  arg->me_mapaddr = meta->mm_address;
  arg->me_mapsize = env->me_mapsize;
  arg->me_maxreaders = env->me_maxreaders;
  arg->me_numreaders = env->me_txns->mti_numreaders;
  arg->me_tail_txnid = 0; /* overwritten two statements below */

  /* Tail txnid: the smallest mr_txnid among reader slots with a non-zero
   * pid, capped at the last txnid. */
  r = env->me_txns->mti_readers;
  arg->me_tail_txnid = arg->me_last_txnid;
  for (i = 0; i < arg->me_numreaders; ++i) {
    if (r[i].mr_pid) {
      txnid_t mr = r[i].mr_txnid;
      if (arg->me_tail_txnid > mr)
        arg->me_tail_txnid = mr;
    }
  }

  return MDB_SUCCESS;
}

/* Pick the default key comparator from the DB flags. MDB_REVERSEKEY is
 * tested first, then MDB_INTEGERKEY; otherwise mdbx_cmp_memn is used. */
static MDB_cmp_func *mdbx_default_keycmp(unsigned flags) {
  return (flags & MDB_REVERSEKEY) ? mdbx_cmp_memnr : (flags & MDB_INTEGERKEY)
                                                         ? mdbx_cmp_int_a2
                                                         : mdbx_cmp_memn;
}

/* Pick the default data comparator from the DB flags. Returns 0 (no
 * comparator) without MDB_DUPSORT; otherwise MDB_INTEGERDUP is tested
 * first, then MDB_REVERSEDUP, with mdbx_cmp_memn as the fallback. */
static MDB_cmp_func *mdbx_default_datacmp(unsigned flags) {
  return !(flags & MDB_DUPSORT)
             ? 0
             : ((flags & MDB_INTEGERDUP)
                    ? mdbx_cmp_int_ua
                    : ((flags & MDB_REVERSEDUP) ? mdbx_cmp_memnr
                                                : mdbx_cmp_memn));
}

/** Set the default comparison functions for a database.
 * Called immediately after a database is opened to set the defaults.
 * The user can then override them with #mdbx_set_compare() or
 * #mdbx_set_dupsort().
 * @param[in] txn A transaction handle returned by #mdbx_txn_begin()
 * @param[in] dbi A database handle returned by #mdbx_dbi_open()
 */
static void mdbx_default_cmp(MDB_txn *txn, MDB_dbi dbi) {
  unsigned flags = txn->mt_dbs[dbi].md_flags;
  txn->mt_dbxs[dbi].md_cmp = mdbx_default_keycmp(flags);
  txn->mt_dbxs[dbi].md_dcmp = mdbx_default_datacmp(flags);
}

/** Open (and optionally create) a database within a transaction.
 *
 * A NULL `name` selects the main DB. Otherwise the handles already open in
 * this txn are searched by name first; if none matches, the name is looked
 * up in the main DB and, with MDB_CREATE, created when absent. The handle
 * is registered in the first free slot, or in a new slot at mt_numdbs.
 *
 * @param[in] txn transaction handle; must not be blocked.
 * @param[in] name database name, or NULL for the main DB.
 * @param[in] flags must be a subset of VALID_FLAGS.
 * @param[out] dbi receives the database handle.
 * @return MDB_SUCCESS; EINVAL; MDB_VERSION_MISMATCH; MDB_BAD_TXN;
 * MDB_DBS_FULL when no slot is left; MDB_INCOMPATIBLE or MDB_NOTFOUND when
 * the main DB's flags forbid named DBs or the record is not a sub-DB;
 * ENOMEM; or an error from the cursor operations.
 */
int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags,
                  MDB_dbi *dbi) {
  MDB_val key, data;
  MDB_dbi i;
  MDB_cursor mc;
  MDB_db dummy;
  int rc, dbflag, exact;
  unsigned unused = 0, seq;
  char *namedup;
  size_t len;

  if (unlikely(!txn || !dbi))
    return EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDB_VERSION_MISMATCH;

  if (unlikely(flags & ~VALID_FLAGS))
    return EINVAL;

  if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED))
    return MDB_BAD_TXN;

  /* main DB? */
  if (!name) {
    *dbi = MAIN_DBI;
    if (flags & PERSISTENT_FLAGS) {
      uint16_t f2 = flags & PERSISTENT_FLAGS;
      /* make sure flag changes get committed */
      if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) !=
          txn->mt_dbs[MAIN_DBI].md_flags) {
        txn->mt_dbs[MAIN_DBI].md_flags |= f2;
        txn->mt_flags |= MDB_TXN_DIRTY;
      }
    }
    mdbx_default_cmp(txn, MAIN_DBI);
    return MDB_SUCCESS;
  }

  /* The lookup below searches the main DB, so it needs a comparator. */
  if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
    mdbx_default_cmp(txn, MAIN_DBI);
  }

  /* Is the DB already open? */
  len = strlen(name);
  for (i = CORE_DBS; i < txn->mt_numdbs; i++) {
    if (!txn->mt_dbxs[i].md_name.mv_size) {
      /* Remember this free slot */
      if (!unused)
        unused = i;
      continue;
    }
    if (len == txn->mt_dbxs[i].md_name.mv_size &&
        !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
      *dbi = i;
      return MDB_SUCCESS;
    }
  }

  /* If no free slot and max hit, fail */
  if (!unused && unlikely(txn->mt_numdbs >= txn->mt_env->me_maxdbs))
    return MDB_DBS_FULL;

  /* Cannot mix named databases with some mainDB flags */
  if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT | MDB_INTEGERKEY)))
    return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;

  /* Find the DB info */
  dbflag = DB_NEW | DB_VALID | DB_USRVALID;
  exact = 0;
  key.mv_size = len;
  key.mv_data = (void *)name;
  mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL);
  rc = mdbx_cursor_set(&mc, &key, &data, MDB_SET, &exact);
  if (likely(rc == MDB_SUCCESS)) {
    /* make sure this is actually a DB */
    MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
    if (unlikely((node->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA))
      return MDB_INCOMPATIBLE;
  } else if (!(rc == MDB_NOTFOUND && (flags & MDB_CREATE))) {
    return rc;
  }

  /* Done here so we cannot fail after creating a new DB */
  if (unlikely((namedup = strdup(name)) == NULL))
    return ENOMEM;

  if (unlikely(rc)) {
    /* MDB_NOTFOUND and MDB_CREATE: Create new DB */
    data.mv_size = sizeof(MDB_db);
    data.mv_data = &dummy;
    memset(&dummy, 0, sizeof(dummy));
    dummy.md_root = P_INVALID;
    dummy.md_flags = flags & PERSISTENT_FLAGS;
    WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &key, &data, F_SUBDATA));
    dbflag |= DB_DIRTY;
  }

  if (unlikely(rc)) {
    free(namedup);
  } else {
    /* Got info, register DBI in this txn */
    unsigned slot = unused ? unused : txn->mt_numdbs;
    txn->mt_dbxs[slot].md_name.mv_data = namedup;
    txn->mt_dbxs[slot].md_name.mv_size = len;
    txn->mt_dbxs[slot].md_rel = NULL;
    txn->mt_dbflags[slot] = dbflag;
    /* txn-> and env-> are the same in read txns, use
     * tmp variable to avoid undefined assignment
     */
    seq = ++txn->mt_env->me_dbiseqs[slot];
    txn->mt_dbiseqs[slot] = seq;

    memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
    *dbi = slot;
    mdbx_default_cmp(txn, slot);
    if (!unused) {
      txn->mt_numdbs++;
    }
  }

  return rc;
}

/** Return statistics for one database.
 * @param[in] txn transaction handle; must not be blocked.
 * @param[in] dbi database handle.
 * @param[out] arg receives the statistics.
 * @param[in] bytes must equal sizeof(MDBX_stat).
 * @return MDB_SUCCESS; EINVAL for NULL arguments, an invalid dbi or a size
 * mismatch; MDB_VERSION_MISMATCH; MDB_BAD_TXN.
 */
int __cold mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) {
  if (unlikely(!arg || !txn))
    return EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDB_VERSION_MISMATCH;

  if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID)))
    return EINVAL;

  if (unlikely(bytes != sizeof(MDBX_stat)))
    return EINVAL;

  if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED))
    return MDB_BAD_TXN;

  if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) {
    MDB_cursor mc;
    MDB_xcursor mx;
    /* Stale, must read the DB's root. cursor_init does it for us. */
    mdbx_cursor_init(&mc, txn, dbi, &mx);
  }
  return mdbx_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
}

/** Close a database handle.
 * Does nothing for the core DBs, for an out-of-range handle, or when the
 * slot has no name (already closed). Otherwise the slot's name and flags
 * are cleared, its sequence number is bumped and the name is freed.
 * @param[in] env environment handle (not checked for NULL).
 * @param[in] dbi database handle.
 */
void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) {
  char *ptr;
  if (dbi < CORE_DBS || dbi >= env->me_maxdbs)
    return;
  ptr = env->me_dbxs[dbi].md_name.mv_data;
  /* If there was no name, this was already closed */
  if (ptr) {
    env->me_dbxs[dbi].md_name.mv_data = NULL;
    env->me_dbxs[dbi].md_name.mv_size = 0;
    env->me_dbflags[dbi] = 0;
    env->me_dbiseqs[dbi]++;
    free(ptr);
  }
}

/** Return a database's persistent flags.
 * @param[in] txn transaction handle.
 * @param[in] dbi database handle.
 * @param[out] flags receives md_flags masked with PERSISTENT_FLAGS.
 * @return MDB_SUCCESS; EINVAL for NULL arguments or an invalid dbi;
 * MDB_VERSION_MISMATCH.
 */
int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) {
  if (unlikely(!txn || !flags))
    return EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDB_VERSION_MISMATCH;

  if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID)))
    return EINVAL;

  *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
  return MDB_SUCCESS;
}

/** Add all the DB's pages to the free list.
 * Sets #MDB_TXN_ERROR on failure. An empty DB (MDB_NOTFOUND from the
 * initial search) counts as success. C_INITIALIZED is always cleared on
 * the cursor before returning.
 * @param[in] mc Cursor on the DB to free.
 * @param[in] subs non-Zero to check for sub-DBs in this DB.
 * @return 0 on success, non-zero on failure.
 */
static int mdbx_drop0(MDB_cursor *mc, int subs) {
  int rc;

  rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST);
  if (likely(rc == MDB_SUCCESS)) {
    MDB_txn *txn = mc->mc_txn;
    MDB_node *ni;
    MDB_cursor mx;
    unsigned i;

    /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
     * This also avoids any P_LEAF2 pages, which have no nodes.
     * Also if the DB doesn't have sub-DBs and has no overflow
     * pages, omit scanning leaves.
     */
    if ((mc->mc_flags & C_SUB) || (!subs && !mc->mc_db->md_overflow_pages))
      mdbx_cursor_pop(mc);

    /* mx keeps the initial page stack; it is used to restore mc_pg[] when
     * stepping back up a level. */
    mdbx_cursor_copy(mc, &mx);
    while (mc->mc_snum > 0) {
      MDB_page *mp = mc->mc_pg[mc->mc_top];
      unsigned n = NUMKEYS(mp);
      if (IS_LEAF(mp)) {
        for (i = 0; i < n; i++) {
          ni = NODEPTR(mp, i);
          if (ni->mn_flags & F_BIGDATA) {
            MDB_page *omp;
            pgno_t pg;
            memcpy(&pg, NODEDATA(ni), sizeof(pg));
            rc = mdbx_page_get(mc, pg, &omp, NULL);
            if (unlikely(rc))
              goto done;
            mdbx_cassert(mc, IS_OVERFLOW(omp));
            rc = mdbx_midl_append_range(&txn->mt_free_pgs, pg, omp->mp_pages);
            if (unlikely(rc))
              goto done;
            mc->mc_db->md_overflow_pages -= omp->mp_pages;
            /* Stop scanning once the last overflow page is accounted for
             * and there are no sub-DBs to look for. */
            if (!mc->mc_db->md_overflow_pages && !subs)
              break;
          } else if (subs && (ni->mn_flags & F_SUBDATA)) {
            mdbx_xcursor_init1(mc, ni);
            rc = mdbx_drop0(&mc->mc_xcursor->mx_cursor, 0);
            if (unlikely(rc))
              goto done;
          }
        }
        if (!subs && !mc->mc_db->md_overflow_pages)
          goto pop;
      } else {
        /* Branch page: free every child page it points to. */
        if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, n)) != 0))
          goto done;
        for (i = 0; i < n; i++) {
          pgno_t pg;
          ni = NODEPTR(mp, i);
          pg = NODEPGNO(ni);
          /* free it */
          mdbx_midl_xappend(txn->mt_free_pgs, pg);
        }
      }
      if (!mc->mc_top)
        break;
      mc->mc_ki[mc->mc_top] = i;
      rc = mdbx_cursor_sibling(mc, 1);
      if (rc) {
        if (unlikely(rc != MDB_NOTFOUND))
          goto done;
      /* no more siblings, go back to beginning
       * of previous level.
       */
      pop:
        mdbx_cursor_pop(mc);
        mc->mc_ki[0] = 0;
        for (i = 1; i < mc->mc_snum; i++) {
          mc->mc_ki[i] = 0;
          mc->mc_pg[i] = mx.mc_pg[i];
        }
      }
    }
    /* free it */
    rc = mdbx_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
  done:
    if (unlikely(rc))
      txn->mt_flags |= MDB_TXN_ERROR;
  } else if (rc == MDB_NOTFOUND) {
    rc = MDB_SUCCESS;
  }
  mc->mc_flags &= ~C_INITIALIZED;
  return rc;
}

/** Empty a database, or delete it and close its handle.
 *
 * All pages of the DB are put on the txn's free list and every cursor
 * tracked for this dbi loses C_INITIALIZED and C_EOF. With `del` set and a
 * non-core dbi, the DB's record is removed from the main DB and the handle
 * is closed; otherwise the DB record is reset to empty and marked dirty.
 *
 * @param[in] txn transaction handle; must be writable.
 * @param[in] dbi database handle.
 * @param[in] del 0 to empty the DB, 1 to delete it; other values are
 * rejected.
 * @return MDB_SUCCESS; EINVAL; MDB_VERSION_MISMATCH; MDB_BAD_DBI; EACCES
 * for a read-only txn; or an error from the cursor / drop / delete calls.
 */
int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del) {
  MDB_cursor *mc, *m2;
  int rc;

  if (unlikely(1 < (unsigned)del || !txn))
    return EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDB_VERSION_MISMATCH;

  if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)))
    return EINVAL;

  if (unlikely(TXN_DBI_CHANGED(txn, dbi)))
    return MDB_BAD_DBI;

  if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)))
    return EACCES;

  rc = mdbx_cursor_open(txn, dbi, &mc);
  if (unlikely(rc))
    return rc;

  rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
  /* Invalidate the dropped DB's cursors */
  for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
    m2->mc_flags &= ~(C_INITIALIZED | C_EOF);
  if (unlikely(rc))
    goto leave;

  /* Can't delete the main DB */
  if (del && dbi >= CORE_DBS) {
    rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
    if (likely(!rc)) {
      txn->mt_dbflags[dbi] = DB_STALE;
      mdbx_dbi_close(txn->mt_env, dbi);
    } else {
      txn->mt_flags |= MDB_TXN_ERROR;
    }
  } else {
    /* reset the DB record, mark it dirty */
    txn->mt_dbflags[dbi] |= DB_DIRTY;
    txn->mt_dbs[dbi].md_depth = 0;
    txn->mt_dbs[dbi].md_branch_pages = 0;
    txn->mt_dbs[dbi].md_leaf_pages = 0;
    txn->mt_dbs[dbi].md_overflow_pages = 0;
    txn->mt_dbs[dbi].md_entries = 0;
    txn->mt_dbs[dbi].md_root = P_INVALID;

    txn->mt_flags |= MDB_TXN_DIRTY;
  }
leave:
  mdbx_cursor_close(mc);
  return rc;
}

int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) {
  if (unlikely(!txn))
    return EINVAL;

  if
(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_cmp = cmp; + return MDB_SUCCESS; +} + +int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_dcmp = cmp; + return MDB_SUCCESS; +} + +int mdbx_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_rel = rel; + return MDB_SUCCESS; +} + +int mdbx_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_relctx = ctx; + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_maxkeysize(MDB_env *env) { + if (!env || env->me_signature != MDBX_ME_SIGNATURE) + return EINVAL; + return ENV_MAXKEY(env); +} + +int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { + unsigned i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; + + if (unlikely(!env || !func)) + return -EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i = 0; i < rdrs; i++) { + if (mr[i].mr_pid) { + txnid_t txnid = mr[i].mr_txnid; + if (txnid == ~(txnid_t)0) + sprintf(buf, "%10d %zx -\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid); + else + sprintf(buf, "%10d %zx %zu\n", (int)mr[i].mr_pid, 
(size_t)mr[i].mr_tid, + txnid); + + if (first) { + first = 0; + rc = func(" pid thread txnid\n", ctx); + if (rc < 0) + break; + } + rc = func(buf, ctx); + if (rc < 0) + break; + } + } + if (first) { + rc = func("(no active readers)\n", ctx); + } + return rc; +} + +/** Insert pid into list if not already present. + * return -1 if already present. + */ +static int __cold mdbx_pid_insert(pid_t *ids, pid_t pid) { + /* binary search of pid in list */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while (0 < n) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = pid - ids[cursor]; + + if (val < 0) { + n = pivot; + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + } else { + /* found, so it's a duplicate */ + return -1; + } + } + + if (val > 0) { + ++cursor; + } + ids[0]++; + for (n = ids[0]; n > cursor; n--) + ids[n] = ids[n - 1]; + ids[n] = pid; + return 0; +} + +int __cold mdbx_reader_check(MDB_env *env, int *dead) { + if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) + return EINVAL; + if (dead) + *dead = 0; + return mdbx_reader_check0(env, 0, dead); +} + +/** As #mdbx_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ +static int __cold mdbx_reader_check0(MDB_env *env, int rlocked, int *dead) { + pthread_mutex_t *rmutex = rlocked ? 
NULL : MDB_MUTEX(env, r); + unsigned i, j, rdrs; + MDB_reader *mr; + pid_t *pids, pid; + int rc = MDB_SUCCESS, count = 0; + + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs + 1) * sizeof(pid_t)); + if (!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i = 0; i < rdrs; i++) { + pid = mr[i].mr_pid; + if (pid && pid != env->me_pid) { + if (mdbx_pid_insert(pids, pid) == 0) { + if (!mdbx_reader_pid(env, F_GETLK, pid)) { + /* Stale reader found */ + j = i; + if (rmutex) { + if ((rc = pthread_mutex_lock(rmutex)) != 0) { + if ((rc = mdbx_mutex_failed(env, rmutex, rc))) + break; + rdrs = 0; /* the above checked all readers */ + } else { + /* Recheck, a new process may have reused pid */ + if (mdbx_reader_pid(env, F_GETLK, pid)) + j = rdrs; + } + } + for (; j < rdrs; j++) { + if (mr[j].mr_pid == pid) { + mdbx_debug("clear stale reader pid %u txn %zd", (unsigned)pid, + mr[j].mr_txnid); + mr[j].mr_pid = 0; + count++; + } + } + if (rmutex) + mdbx_mutex_unlock(env, rmutex); + } + } + } + } + free(pids); + if (dead) + *dead = count; + return rc; +} + +static int __cold mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, + int rc) { +#if MDB_USE_ROBUST + if (unlikely(rc == EOWNERDEAD)) { + int rlocked, rc2; + + /* We own the mutex. Clean up after dead previous owner. */ + rc = MDB_SUCCESS; + rlocked = (mutex == MDB_MUTEX(env, r)); + if (!rlocked) { + /* Keep mti_txnid updated, otherwise next writer can + * overwrite data which latest meta page refers to. + * + * LY: Hm, how this can happen, if the mti_txnid + * is updating only at the finish of a successful commit ? 
+ */ + + MDB_meta *meta = mdbx_meta_head_w(env); + assert(env->me_txns->mti_txnid == meta->mm_txnid); + (void)meta; + /* env is hosed if the dead thread was ours */ + if (env->me_txn) { + env->me_flags |= MDB_FATAL_ERROR; + env->me_txn = NULL; + rc = MDB_PANIC; + } + } + mdbx_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); + rc2 = mdbx_reader_check0(env, rlocked, NULL); + if (rc2 == 0) + rc2 = pthread_mutex_consistent(mutex); + if (rc || (rc = rc2)) { + mdbx_debug("mutex recovery failed, %s", mdbx_strerror(rc)); + pthread_mutex_unlock(mutex); + } + } +#endif /* MDB_USE_ROBUST */ + if (unlikely(rc)) { + mdbx_debug("lock mutex failed, %s", mdbx_strerror(rc)); + if (rc != EDEADLK) { + env->me_flags |= MDB_FATAL_ERROR; + rc = MDB_PANIC; + } + } + + return rc; +} + +static int mdbx_mutex_lock(MDB_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_lock(mutex); + if (unlikely(rc)) + rc = mdbx_mutex_failed(env, mutex, rc); + return rc; +} + +static void mdbx_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_unlock(mutex); + mdbx_assert(env, rc == 0); + (void)env; + (void)rc; +} + +static unsigned __hot mdbx_midl_search(MDB_IDL ids, MDB_ID id) { + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while (0 < n) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = mdbx_cmp2int(ids[cursor], id); + + if (val < 0) { + n = pivot; + + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if (val > 0) { + ++cursor; + } + return cursor; +} + +static MDB_IDL mdbx_midl_alloc(int num) { + MDB_IDL ids = malloc((num + 2) * sizeof(MDB_ID)); + if (ids) { + *ids++ = num; + *ids = 0; + } + return ids; +} + +static void mdbx_midl_free(MDB_IDL ids) { + if (ids) + free(ids 
- 1); +} + +static void mdbx_midl_shrink(MDB_IDL *idp) { + MDB_IDL ids = *idp; + if (*(--ids) > MDB_IDL_UM_MAX && + (ids = realloc(ids, (MDB_IDL_UM_MAX + 2) * sizeof(MDB_ID)))) { + *ids++ = MDB_IDL_UM_MAX; + *idp = ids; + } +} + +static int mdbx_midl_grow(MDB_IDL *idp, int num) { + MDB_IDL idn = *idp - 1; + /* grow it */ + idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); + if (!idn) + return ENOMEM; + *idn++ += num; + *idp = idn; + return 0; +} + +static int mdbx_midl_need(MDB_IDL *idp, unsigned num) { + MDB_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num / 4 + (256 + 2)) & -256; + if (!(ids = realloc(ids - 1, num * sizeof(MDB_ID)))) + return ENOMEM; + *ids++ = num - 2; + *idp = ids; + } + return 0; +} + +static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id) { + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] >= ids[-1]) { + if (mdbx_midl_grow(idp, MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0]++; + ids[ids[0]] = id; + return 0; +} + +static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app) { + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] + app[0] >= ids[-1]) { + if (mdbx_midl_grow(idp, app[0])) + return ENOMEM; + ids = *idp; + } + memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(MDB_ID)); + ids[0] += app[0]; + return 0; +} + +static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n) { + MDB_ID *ids = *idp, len = ids[0]; + /* Too big? 
*/ + if (len + n > ids[-1]) { + if (mdbx_midl_grow(idp, n | MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} + +static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { + MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k; + idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ + old_id = idl[j]; + while (i) { + merge_id = merge[i--]; + for (; old_id < merge_id; old_id = idl[--j]) + idl[k--] = old_id; + idl[k--] = merge_id; + } + idl[0] = total; +} + +/* Quicksort + Insertion sort for small arrays */ + +#define SMALL 8 +#define MIDL_SWAP(a, b) \ + { \ + itmp = (a); \ + (a) = (b); \ + (b) = itmp; \ + } + +static void __hot mdbx_midl_sort(MDB_IDL ids) { + /* Max possible depth of int-indexed tree * 2 items/level */ + int istack[sizeof(int) * CHAR_BIT * 2]; + int i, j, k, l, ir, jstack; + MDB_ID a, itmp; + + ir = (int)ids[0]; + l = 1; + jstack = 0; + for (;;) { + if (ir - l < SMALL) { /* Insertion sort */ + for (j = l + 1; j <= ir; j++) { + a = ids[j]; + for (i = j - 1; i >= 1; i--) { + if (ids[i] >= a) + break; + ids[i + 1] = ids[i]; + } + ids[i + 1] = a; + } + if (jstack == 0) + break; + ir = istack[jstack--]; + l = istack[jstack--]; + } else { + k = (l + ir) >> 1; /* Choose median of left, center, right */ + MIDL_SWAP(ids[k], ids[l + 1]); + if (ids[l] < ids[ir]) { + MIDL_SWAP(ids[l], ids[ir]); + } + if (ids[l + 1] < ids[ir]) { + MIDL_SWAP(ids[l + 1], ids[ir]); + } + if (ids[l] < ids[l + 1]) { + MIDL_SWAP(ids[l], ids[l + 1]); + } + i = l + 1; + j = ir; + a = ids[l + 1]; + for (;;) { + do + i++; + while (ids[i] > a); + do + j--; + while (ids[j] < a); + if (j < i) + break; + MIDL_SWAP(ids[i], ids[j]); + } + ids[l + 1] = ids[j]; + ids[j] = a; + jstack += 2; + if (ir - i + 1 >= j - l) { + istack[jstack] = ir; + istack[jstack - 1] = i; + ir = j - 1; + } else { + istack[jstack] = j - 1; + istack[jstack - 1] = l; + l = i; + } + } + } +} + +static unsigned __hot 
mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id) { + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while (0 < n) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = mdbx_cmp2int(id, ids[cursor].mid); + + if (val < 0) { + n = pivot; + + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if (val > 0) { + ++cursor; + } + return cursor; +} + +static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { + unsigned x, i; + + x = mdbx_mid2l_search(ids, id->mid); + + if (x < 1) { + /* internal error */ + return -2; + } + + if (x <= ids[0].mid && ids[x].mid == id->mid) { + /* duplicate */ + return -1; + } + + if (ids[0].mid >= MDB_IDL_UM_MAX) { + /* too big */ + return -2; + + } else { + /* insert id */ + ids[0].mid++; + for (i = (unsigned)ids[0].mid; i > x; i--) + ids[i] = ids[i - 1]; + ids[x] = *id; + } + + return 0; +} + +static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id) { + /* Too big? 
*/ + if (ids[0].mid >= MDB_IDL_UM_MAX) { + return -2; + } + ids[0].mid++; + ids[ids[0].mid] = *id; + return 0; +} + +int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn) { + unsigned ret = mdbx_runtime_flags; + if (flags != (int)MDBX_DBG_DNT) + mdbx_runtime_flags = flags; + if (logger != (MDBX_debug_func *)MDBX_DBG_DNT) + mdbx_debug_logger = logger; + if (edge_txn != (long)MDBX_DBG_DNT) { +#if MDB_DEBUG + mdbx_debug_edge = edge_txn; +#endif + } + return ret; +} + +static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { + int retry; + txnid_t snap; + mdbx_debug("DB size maxed out"); + + for (retry = 0;; ++retry) { + int reader; + + if (mdbx_reader_check(env, NULL)) + break; + + snap = mdbx_find_oldest(env, &reader); + if (oldest < snap || reader < 0) { + if (retry && env->me_oom_func) { + /* LY: notify end of oom-loop */ + env->me_oom_func(env, 0, 0, oldest, snap - oldest, -retry); + } + return snap; + } + + MDB_reader *r; + pthread_t tid; + pid_t pid; + int rc; + + if (!env->me_oom_func) + break; + + r = &env->me_txns->mti_readers[reader]; + pid = r->mr_pid; + tid = r->mr_tid; + if (r->mr_txnid != oldest || pid <= 0) + continue; + + rc = env->me_oom_func(env, pid, (void *)tid, oldest, + mdbx_meta_head_w(env)->mm_txnid - oldest, retry); + if (rc < 0) + break; + + if (rc) { + r->mr_txnid = ~(txnid_t)0; + if (rc > 1) { + r->mr_tid = 0; + r->mr_pid = 0; + mdbx_coherent_barrier(); + } + } + } + + if (retry && env->me_oom_func) { + /* LY: notify end of oom-loop */ + env->me_oom_func(env, 0, 0, oldest, 0, -retry); + } + return mdbx_find_oldest(env, NULL); +} + +int __cold mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) { + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + env->me_sync_threshold = bytes; + return env->me_map ? 
mdbx_env_sync(env, 0) : MDB_SUCCESS; +} + +void __cold mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) { + if (likely(env && env->me_signature == MDBX_ME_SIGNATURE)) + env->me_oom_func = oomfunc; +} + +MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDB_env *env) { + return likely(env && env->me_signature == MDBX_ME_SIGNATURE) + ? env->me_oom_func + : NULL; +} + +ATTRIBUTE_NO_SANITIZE_THREAD /* LY: avoid tsan-trap by me_txn, mm_last_pg and + mt_next_pgno */ + int + mdbx_txn_straggler(MDB_txn *txn, int *percent) { + MDB_env *env; + MDB_meta *meta; + txnid_t lag; + + if (unlikely(!txn)) + return -EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!txn->mt_u.reader)) + return -1; + + env = txn->mt_env; + meta = mdbx_meta_head_r(env); + if (percent) { + size_t maxpg = env->me_maxpg; + size_t last = meta->mm_last_pg + 1; + if (env->me_txn) + last = env->me_txn0->mt_next_pgno; + *percent = (last * 100ull + maxpg / 2) / maxpg; + } + lag = meta->mm_txnid - txn->mt_u.reader->mr_txnid; + return (0 > (long)lag) ? ~0u >> 1 : lag; +} + +typedef struct mdbx_walk_ctx { + MDB_txn *mw_txn; + void *mw_user; + MDBX_pgvisitor_func *mw_visitor; +} mdbx_walk_ctx_t; + +/** Depth-first tree traversal. */ +static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, + pgno_t pg, int deep) { + MDB_page *mp; + int rc, i, nkeys; + unsigned header_size, unused_size, payload_size, align_bytes; + const char *type; + + if (pg == P_INVALID) + return MDB_SUCCESS; /* empty db */ + + MDB_cursor mc; + memset(&mc, 0, sizeof(mc)); + mc.mc_snum = 1; + mc.mc_txn = ctx->mw_txn; + + rc = mdbx_page_get(&mc, pg, &mp, NULL); + if (rc) + return rc; + if (pg != mp->mp_p.p_pgno) + return MDB_CORRUPTED; + + nkeys = NUMKEYS(mp); + header_size = IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEBASE + mp->mp_lower; + unused_size = SIZELEFT(mp); + payload_size = 0; + + /* LY: Don't use mask here, e.g bitwise + * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). + * Pages should not me marked dirty/loose or otherwise. */ + switch (mp->mp_flags) { + case P_BRANCH: + type = "branch"; + if (nkeys < 1) + return MDB_CORRUPTED; + break; + case P_LEAF: + type = "leaf"; + break; + case P_LEAF | P_SUBP: + type = "dupsort-subleaf"; + break; + case P_LEAF | P_LEAF2: + type = "dupfixed-leaf"; + break; + case P_LEAF | P_LEAF2 | P_SUBP: + type = "dupsort-dupfixed-subleaf"; + break; + case P_META: + case P_OVERFLOW: + default: + return MDB_CORRUPTED; + } + + for (align_bytes = i = 0; i < nkeys; + align_bytes += ((payload_size + align_bytes) & 1), i++) { + MDB_node *node; + + if (IS_LEAF2(mp)) { + /* LEAF2 pages have no mp_ptrs[] or node headers */ + payload_size += mp->mp_leaf2_ksize; + continue; + } + + node = NODEPTR(mp, i); + payload_size += NODESIZE + node->mn_ksize; + + if (IS_BRANCH(mp)) { + rc = mdbx_env_walk(ctx, dbi, NODEPGNO(node), deep); + if (rc) + return rc; + continue; + } + + assert(IS_LEAF(mp)); + if (node->mn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t *opg; + size_t over_header, over_payload, over_unused; + + payload_size += sizeof(pgno_t); + opg = NODEDATA(node); + rc = mdbx_page_get(&mc, *opg, &omp, NULL); + if (rc) + return rc; + if (*opg != omp->mp_p.p_pgno) + return MDB_CORRUPTED; + /* LY: Don't use mask here, e.g bitwise + * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). + * Pages should not me marked dirty/loose or otherwise. 
*/ + if (P_OVERFLOW != omp->mp_flags) + return MDB_CORRUPTED; + + over_header = PAGEHDRSZ; + over_payload = NODEDSZ(node); + over_unused = omp->mp_pages * ctx->mw_txn->mt_env->me_psize - + over_payload - over_header; + + rc = ctx->mw_visitor(*opg, omp->mp_pages, ctx->mw_user, dbi, + "overflow-data", 1, over_payload, over_header, + over_unused); + if (rc) + return rc; + continue; + } + + payload_size += NODEDSZ(node); + if (node->mn_flags & F_SUBDATA) { + MDB_db *db = NODEDATA(node); + char *name = NULL; + + if (!(node->mn_flags & F_DUPDATA)) { + name = NODEKEY(node); + int namelen = (char *)db - name; + name = memcpy(alloca(namelen + 1), name, namelen); + name[namelen] = 0; + } + rc = mdbx_env_walk(ctx, (name && name[0]) ? name : dbi, db->md_root, + deep + 1); + if (rc) + return rc; + } + } + + return ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, type, nkeys, + payload_size, header_size, unused_size + align_bytes); +} + +int __cold mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, + void *user) { + mdbx_walk_ctx_t ctx; + int rc; + + if (unlikely(!txn)) + return MDB_BAD_TXN; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + ctx.mw_txn = txn; + ctx.mw_user = user; + ctx.mw_visitor = visitor; + + rc = visitor(0, 2, user, "lmdb", "meta", 2, sizeof(MDB_meta) * 2, + PAGEHDRSZ * 2, + (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) * 2); + if (!rc) + rc = mdbx_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); + if (!rc) + rc = mdbx_env_walk(&ctx, "main", txn->mt_dbs[MAIN_DBI].md_root, 0); + if (!rc) + rc = visitor(P_INVALID, 0, user, NULL, NULL, 0, 0, 0, 0); + return rc; +} + +int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + return EACCES; + + if (likely(canary)) { + txn->mt_canary.x = canary->x; + 
txn->mt_canary.y = canary->y; + txn->mt_canary.z = canary->z; + } + txn->mt_canary.v = txn->mt_txnid; + + return MDB_SUCCESS; +} + +size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return 0; + + if (likely(canary)) + *canary = txn->mt_canary; + + return txn->mt_txnid; +} + +int mdbx_cursor_on_first(MDB_cursor *mc) { + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_FALSE; + + unsigned i; + for (i = 0; i < mc->mc_snum; ++i) { + if (mc->mc_ki[i]) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + +int mdbx_cursor_on_last(MDB_cursor *mc) { + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_FALSE; + + unsigned i; + for (i = 0; i < mc->mc_snum; ++i) { + unsigned nkeys = NUMKEYS(mc->mc_pg[i]); + if (mc->mc_ki[i] < nkeys - 1) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + +int mdbx_cursor_eof(MDB_cursor *mc) { + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if ((mc->mc_flags & C_INITIALIZED) == 0) + return MDBX_RESULT_TRUE; + + if (mc->mc_snum == 0) + return MDBX_RESULT_TRUE; + + if ((mc->mc_flags & C_EOF) && + mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDBX_RESULT_TRUE; + + return MDBX_RESULT_FALSE; +} + +static int mdbx_is_samedata(const MDB_val *a, const MDB_val *b) { + return a->iov_len == b->iov_len && + memcmp(a->iov_base, b->iov_base, a->iov_len) == 0; } /* Позволяет обновить или удалить существующую запись с получением @@ -444,58 +11300,59 @@ static int mdbx_is_samedata(const MDB_val* a, const MDB_val* b) { * - внешняя аллокация курсоров, в том числе на стеке 
(без malloc). * - получения статуса страницы по адресу (знать о P_DIRTY). */ -int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *new_data, MDB_val *old_data, unsigned flags) -{ - MDB_cursor mc; - MDB_xcursor mx; +int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, + MDB_val *old_data, unsigned flags) { + MDB_cursor mc; + MDB_xcursor mx; - if (unlikely(!key || !old_data || !txn || old_data == new_data)) - return EINVAL; + if (unlikely(!key || !old_data || !txn || old_data == new_data)) + return EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; - if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) - return EINVAL; + if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) + return EINVAL; - if (unlikely(new_data == NULL && !(flags & MDB_CURRENT))) - return EINVAL; + if (unlikely(new_data == NULL && !(flags & MDB_CURRENT))) + return EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; - if (unlikely(flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP|MDB_CURRENT))) - return EINVAL; + if (unlikely(flags & + ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | + MDB_APPENDDUP | MDB_CURRENT))) + return EINVAL; - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) + return (txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; - mdb_cursor_init(&mc, txn, dbi, &mx); - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; + mdbx_cursor_init(&mc, txn, dbi, &mx); + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; - int rc; - MDB_val present_key = *key; - if (F_ISSET(flags, MDB_CURRENT | MDB_NOOVERWRITE)) { - /* в old_data значение для выбора конкретного дубликата */ - if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDB_DUPSORT))) { - rc = EINVAL; - goto bailout; - } + int rc; + MDB_val present_key = *key; + if (F_ISSET(flags, MDB_CURRENT | MDB_NOOVERWRITE)) { + /* в old_data значение для выбора конкретного дубликата */ + if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDB_DUPSORT))) { + rc = EINVAL; + goto bailout; + } - /* убираем лишний бит, он был признаком запрошенного режима */ - flags -= MDB_NOOVERWRITE; + /* убираем лишний бит, он был признаком запрошенного режима */ + flags -= MDB_NOOVERWRITE; - rc = mdbx_cursor_get(&mc, &present_key, old_data, MDB_GET_BOTH); - if (rc != MDB_SUCCESS) - goto bailout; + rc = mdbx_cursor_get(&mc, &present_key, old_data, MDB_GET_BOTH); + if (rc != MDB_SUCCESS) + goto bailout; - if (new_data) { - /* обновление конкретного дубликата */ - if (mdbx_is_samedata(old_data, new_data)) - /* если данные совпадают, то ничего делать не надо */ - goto bailout; + if (new_data) { + /* обновление конкретного дубликата */ + if (mdbx_is_samedata(old_data, new_data)) + /* если данные совпадают, то ничего делать не надо */ + goto bailout; #if 0 /* LY: исправлено в mdbx_cursor_put(), здесь в качестве памятки */ MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA) @@ -516,126 +11373,127 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, flags -= MDB_CURRENT; } #endif - } - } else { - /* в old_data буфер для сохранения предыдущего значения */ - if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) - return EINVAL; - MDB_val present_data; - rc = mdbx_cursor_get(&mc, 
&present_key, &present_data, MDB_SET_KEY); - if (unlikely(rc != MDB_SUCCESS)) { - old_data->iov_base = NULL; - old_data->iov_len = rc; - if (rc != MDB_NOTFOUND || (flags & MDB_CURRENT)) - goto bailout; - } else if (flags & MDB_NOOVERWRITE) { - rc = MDB_KEYEXIST; - *old_data = present_data; - goto bailout; - } else { - MDB_page *page = mc.mc_pg[mc.mc_top]; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - if (flags & MDB_CURRENT) { - /* для не-уникальных ключей позволяем update/delete только если ключ один */ - MDB_node *leaf = NODEPTR(page, mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_tassert(txn, XCURSOR_INITED(&mc) && mc.mc_xcursor->mx_db.md_entries > 1); - if (mc.mc_xcursor->mx_db.md_entries > 1) { - rc = MDBX_EMULTIVAL; - goto bailout; - } - } - /* если данные совпадают, то ничего делать не надо */ - if (new_data && mdbx_is_samedata(&present_data, new_data)) { - *old_data = *new_data; - goto bailout; - } - /* В оригинальной LMDB фладок MDB_CURRENT здесь приведет - * к замене данных без учета MDB_DUPSORT сортировки, - * но здесь это в любом случае допустимо, так как мы - * проверили что для ключа есть только одно значение. 
*/ - } else if ((flags & MDB_NODUPDATA) && mdbx_is_samedata(&present_data, new_data)) { - /* если данные совпадают и установлен MDB_NODUPDATA */ - rc = MDB_KEYEXIST; - goto bailout; - } - } else { - /* если данные совпадают, то ничего делать не надо */ - if (new_data && mdbx_is_samedata(&present_data, new_data)) { - *old_data = *new_data; - goto bailout; - } - flags |= MDB_CURRENT; - } + } + } else { + /* в old_data буфер для сохранения предыдущего значения */ + if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) + return EINVAL; + MDB_val present_data; + rc = mdbx_cursor_get(&mc, &present_key, &present_data, MDB_SET_KEY); + if (unlikely(rc != MDB_SUCCESS)) { + old_data->iov_base = NULL; + old_data->iov_len = rc; + if (rc != MDB_NOTFOUND || (flags & MDB_CURRENT)) + goto bailout; + } else if (flags & MDB_NOOVERWRITE) { + rc = MDB_KEYEXIST; + *old_data = present_data; + goto bailout; + } else { + MDB_page *page = mc.mc_pg[mc.mc_top]; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + if (flags & MDB_CURRENT) { + /* для не-уникальных ключей позволяем update/delete только если ключ + * один */ + MDB_node *leaf = NODEPTR(page, mc.mc_ki[mc.mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_tassert(txn, XCURSOR_INITED(&mc) && + mc.mc_xcursor->mx_db.md_entries > 1); + if (mc.mc_xcursor->mx_db.md_entries > 1) { + rc = MDBX_EMULTIVAL; + goto bailout; + } + } + /* если данные совпадают, то ничего делать не надо */ + if (new_data && mdbx_is_samedata(&present_data, new_data)) { + *old_data = *new_data; + goto bailout; + } + /* В оригинальной LMDB фладок MDB_CURRENT здесь приведет + * к замене данных без учета MDB_DUPSORT сортировки, + * но здесь это в любом случае допустимо, так как мы + * проверили что для ключа есть только одно значение. 
*/ + } else if ((flags & MDB_NODUPDATA) && + mdbx_is_samedata(&present_data, new_data)) { + /* если данные совпадают и установлен MDB_NODUPDATA */ + rc = MDB_KEYEXIST; + goto bailout; + } + } else { + /* если данные совпадают, то ничего делать не надо */ + if (new_data && mdbx_is_samedata(&present_data, new_data)) { + *old_data = *new_data; + goto bailout; + } + flags |= MDB_CURRENT; + } - if (page->mp_flags & P_DIRTY) { - if (unlikely(old_data->iov_len < present_data.iov_len)) { - old_data->iov_base = NULL; - old_data->iov_len = present_data.iov_len; - rc = MDBX_RESULT_TRUE; - goto bailout; - } - memcpy(old_data->iov_base, present_data.iov_base, present_data.iov_len); - old_data->iov_len = present_data.iov_len; - } else { - *old_data = present_data; - } - } - } + if (page->mp_flags & P_DIRTY) { + if (unlikely(old_data->iov_len < present_data.iov_len)) { + old_data->iov_base = NULL; + old_data->iov_len = present_data.iov_len; + rc = MDBX_RESULT_TRUE; + goto bailout; + } + memcpy(old_data->iov_base, present_data.iov_base, present_data.iov_len); + old_data->iov_len = present_data.iov_len; + } else { + *old_data = present_data; + } + } + } - if (likely(new_data)) - rc = mdbx_cursor_put(&mc, key, new_data, flags); - else - rc = mdbx_cursor_del(&mc, 0); + if (likely(new_data)) + rc = mdbx_cursor_put(&mc, key, new_data, flags); + else + rc = mdbx_cursor_del(&mc, 0); bailout: - txn->mt_cursors[dbi] = mc.mc_next; - return rc; + txn->mt_cursors[dbi] = mc.mc_next; + return rc; } -int -mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, int* values_count) -{ - DKBUF; - mdb_debug("===> get db %u key [%s]", dbi, DKEY(key)); +int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + int *values_count) { + DKBUF; + mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - if (unlikely(!key || !data || !txn)) - return EINVAL; + if (unlikely(!key || !data || !txn)) + return EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return 
MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; - MDB_cursor mc; - MDB_xcursor mx; - mdb_cursor_init(&mc, txn, dbi, &mx); + MDB_cursor mc; + MDB_xcursor mx; + mdbx_cursor_init(&mc, txn, dbi, &mx); - int exact = 0; - int rc = mdb_cursor_set(&mc, key, data, MDB_SET_KEY, &exact); - if (unlikely(rc != MDB_SUCCESS)) { - if (rc == MDB_NOTFOUND && values_count) - *values_count = 0; - return rc; - } + int exact = 0; + int rc = mdbx_cursor_set(&mc, key, data, MDB_SET_KEY, &exact); + if (unlikely(rc != MDB_SUCCESS)) { + if (rc == MDB_NOTFOUND && values_count) + *values_count = 0; + return rc; + } - if (values_count) { - *values_count = 1; - if (mc.mc_xcursor != NULL) { - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_tassert(txn, mc.mc_xcursor == &mx - && (mx.mx_cursor.mc_flags & C_INITIALIZED)); - *values_count = mx.mx_db.md_entries; - } - } - } - return MDB_SUCCESS; + if (values_count) { + *values_count = 1; + if (mc.mc_xcursor != NULL) { + MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_tassert(txn, mc.mc_xcursor == &mx && + (mx.mx_cursor.mc_flags & C_INITIALIZED)); + *values_count = mx.mx_db.md_entries; + } + } + } + return MDB_SUCCESS; } /* Функция сообщает находится ли указанный адрес в "грязной" странице у @@ -660,80 +11518,79 @@ mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, * так гарантируется что актуальный заголовок страницы будет физически * расположен в той-же странице памяти, в том числе для многостраничных * P_OVERFLOW страниц с длинными данными. 
*/ -int mdbx_is_dirty(const MDB_txn *txn, const void* ptr) -{ - if (unlikely(!txn)) - return EINVAL; +int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { + if (unlikely(!txn)) + return EINVAL; - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; - if (unlikely(txn->mt_flags & MDB_TXN_RDONLY)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDB_TXN_RDONLY)) + return MDB_BAD_TXN; - const MDB_env *env = txn->mt_env; - const uintptr_t mask = ~(uintptr_t) (env->me_psize - 1); - const MDB_page *page = (const MDB_page *) ((uintptr_t) ptr & mask); + const MDB_env *env = txn->mt_env; + const uintptr_t mask = ~(uintptr_t)(env->me_psize - 1); + const MDB_page *page = (const MDB_page *)((uintptr_t)ptr & mask); - /* LY: Тут не всё хорошо с абсолютной достоверностью результата, - * так как флажок P_DIRTY в LMDB может означать не совсем то, - * что было исходно задумано, детали см в логике кода mdb_page_touch(). - * - * Более того, в режиме БЕЗ WRITEMAP грязные страницы выделяются через - * malloc(), т.е. находятся вне mmap-диаппазона. - * - * Тем не менее, однозначно страница "не грязная" если: - * - адрес находится внутри mmap-диаппазона и в заголовке страницы - * нет флажка P_DIRTY, то однозначно страница "не грязная". - * - адрес вне mmap-диаппазона и его нет среди списка "грязных" страниц. - */ - if (env->me_map < (char*) page) { - const size_t used_size = env->me_psize * txn->mt_next_pgno; - if (env->me_map + used_size > (char*) page) { - /* страница внутри диапазона */ - if (page->mp_flags & P_DIRTY) - return MDBX_RESULT_TRUE; - return MDBX_RESULT_FALSE; - } - /* Гипотетически здесь возможна ситуация, когда указатель адресует что-то - * в пределах mmap, но за границей распределенных страниц. Это тяжелая - * ошибка, которой не возможно добиться без каких-то мега-нарушений. - * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. 
*/ - mdb_tassert(txn, env->me_map + env->me_mapsize > (char*) page); - } - /* Страница вне mmap-диаппазона */ + /* LY: Тут не всё хорошо с абсолютной достоверностью результата, + * так как флажок P_DIRTY в LMDB может означать не совсем то, + * что было исходно задумано, детали см в логике кода mdbx_page_touch(). + * + * Более того, в режиме БЕЗ WRITEMAP грязные страницы выделяются через + * malloc(), т.е. находятся вне mmap-диаппазона. + * + * Тем не менее, однозначно страница "не грязная" если: + * - адрес находится внутри mmap-диаппазона и в заголовке страницы + * нет флажка P_DIRTY, то однозначно страница "не грязная". + * - адрес вне mmap-диаппазона и его нет среди списка "грязных" страниц. + */ + if (env->me_map < (char *)page) { + const size_t used_size = env->me_psize * txn->mt_next_pgno; + if (env->me_map + used_size > (char *)page) { + /* страница внутри диапазона */ + if (page->mp_flags & P_DIRTY) + return MDBX_RESULT_TRUE; + return MDBX_RESULT_FALSE; + } + /* Гипотетически здесь возможна ситуация, когда указатель адресует что-то + * в пределах mmap, но за границей распределенных страниц. Это тяжелая + * ошибка, которой не возможно добиться без каких-то мега-нарушений. + * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. */ + mdbx_tassert(txn, env->me_map + env->me_mapsize > (char *)page); + } + /* Страница вне mmap-диаппазона */ - if (env->me_flags & MDB_WRITEMAP) - /* Если MDB_WRITEMAP, то результат уже ясен. */ - return MDBX_RESULT_FALSE; + if (env->me_flags & MDB_WRITEMAP) + /* Если MDB_WRITEMAP, то результат уже ясен. */ + return MDBX_RESULT_FALSE; - /* Смотрим список грязных страниц у заданной транзакции. */ - MDB_ID2 *list = txn->mt_u.dirty_list; - if (list) { - unsigned i, n = list[0].mid; - for (i = 1; i <= n; i++) { - const MDB_page *dirty = list[i].mptr; - if (dirty == page) - return MDBX_RESULT_TRUE; - } - } + /* Смотрим список грязных страниц у заданной транзакции. 
*/ + MDB_ID2 *list = txn->mt_u.dirty_list; + if (list) { + unsigned i, n = list[0].mid; + for (i = 1; i <= n; i++) { + const MDB_page *dirty = list[i].mptr; + if (dirty == page) + return MDBX_RESULT_TRUE; + } + } - /* При вложенных транзакциях, страница может быть в dirty-списке - * родительской транзакции, но в этом случае она будет скопирована перед - * изменением в текущей транзакции, т.е. относительно заданной транзакции - * проверяемый адрес "не грязный". */ - return MDBX_RESULT_FALSE; + /* При вложенных транзакциях, страница может быть в dirty-списке + * родительской транзакции, но в этом случае она будет скопирована перед + * изменением в текущей транзакции, т.е. относительно заданной транзакции + * проверяемый адрес "не грязный". */ + return MDBX_RESULT_FALSE; } int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *pdbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp) -{ - int rc = mdbx_dbi_open(txn, name, flags, pdbi); - if (likely(rc == MDB_SUCCESS)) { - MDB_dbi dbi = *pdbi; - unsigned flags = txn->mt_dbs[dbi].md_flags; - txn->mt_dbxs[dbi].md_cmp = keycmp ? keycmp : mdbx_default_keycmp(flags); - txn->mt_dbxs[dbi].md_dcmp = datacmp ? datacmp : mdbx_default_datacmp(flags); - } - return rc; + MDB_dbi *pdbi, MDB_cmp_func *keycmp, + MDB_cmp_func *datacmp) { + int rc = mdbx_dbi_open(txn, name, flags, pdbi); + if (likely(rc == MDB_SUCCESS)) { + MDB_dbi dbi = *pdbi; + unsigned flags = txn->mt_dbs[dbi].md_flags; + txn->mt_dbxs[dbi].md_cmp = keycmp ? keycmp : mdbx_default_keycmp(flags); + txn->mt_dbxs[dbi].md_dcmp = datacmp ? datacmp : mdbx_default_datacmp(flags); + } + return rc; } diff --git a/mdbx.h b/mdbx.h index 515e819e..b86136a4 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1,7 +1,13 @@ /* * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. 
+ * + * This code is derived from "LMDB engine" written by + * Howard Chu (Symas Corporation), which itself derived from btree.c + * written by Martin Hedenfalk. + * + * --- + * + * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -10,214 +16,1740 @@ * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at * . + * + * --- + * + * Portions Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -/* - This is solution to provide flexible compatibility with the original liblmdb. - Yeah, this way is partially ugly and madness... - - But, on the other hand, only this way allows provide both API with - minimal changes the source code of an applications, and the source - code of the library itself. Anyway, ideas are welcome! - - So, - - When needed drop-in replacement for liblmdb you should: - - 'make lmdb' to build liblmdb.so and liblmdb.a; - - #include and use mdb_* functions; - - linking with liblmdb.so or liblmdb.a; - - = This provides nearly full compatibility with - original LMDB from Symas Corp. 
- But you should be noted - such compatibility - is not a goal for MDBX. - - When exactly the libmdbx is needed, you should: - - 'make mdbx' to build libmdbx.so and libmdbx.a; - - #include and use mdbx_* functions; - - linking with libmdbx.so or libmdbx.a; - - = This allows using (linking) both MDBX and LMDB - simultaneously in the one application, for instance - to benchmarking and/or comparison. -*/ - +#pragma once #ifndef _MDBX_H_ #define _MDBX_H_ #define MDBX_MODE_ENABLED 1 -#ifndef _GNU_SOURCE -# define _GNU_SOURCE -#endif - -/** @defgroup mdbx MDBX API - * @{ - * @brief libmdbx - Extended version of LMDB - */ - -#define mdb_version mdbx_version -#define mdb_strerror mdbx_strerror -#define mdb_env_create mdbx_env_create -#define mdb_env_open mdbx_env_open -#define mdb_env_open_ex mdbx_env_open_ex -#define mdb_env_copy mdbx_env_copy -#define mdb_env_copyfd mdbx_env_copyfd -#define mdb_env_copy2 mdbx_env_copy2 -#define mdb_env_copyfd2 mdbx_env_copyfd2 -#define mdb_env_sync mdbx_env_sync -#define mdb_env_close mdbx_env_close -#define mdb_env_set_flags mdbx_env_set_flags -#define mdb_env_get_flags mdbx_env_get_flags -#define mdb_env_get_path mdbx_env_get_path -#define mdb_env_get_fd mdbx_env_get_fd -#define mdb_env_set_mapsize mdbx_env_set_mapsize -#define mdb_env_set_maxreaders mdbx_env_set_maxreaders -#define mdb_env_get_maxreaders mdbx_env_get_maxreaders -#define mdb_env_set_maxdbs mdbx_env_set_maxdbs -#define mdb_env_get_maxkeysize mdbx_env_get_maxkeysize -#define mdb_env_set_userctx mdbx_env_set_userctx -#define mdb_env_get_userctx mdbx_env_get_userctx -#define mdb_env_set_assert mdbx_env_set_assert -#define mdb_txn_begin mdbx_txn_begin -#define mdb_txn_env mdbx_txn_env -#define mdb_txn_id mdbx_txn_id -#define mdb_txn_commit mdbx_txn_commit -#define mdb_txn_abort mdbx_txn_abort -#define mdb_txn_reset mdbx_txn_reset -#define mdb_txn_renew mdbx_txn_renew -#define mdb_dbi_open mdbx_dbi_open -#define mdb_dbi_flags mdbx_dbi_flags -#define mdb_dbi_close 
mdbx_dbi_close -#define mdb_drop mdbx_drop -#define mdb_set_compare mdbx_set_compare -#define mdb_set_dupsort mdbx_set_dupsort -#define mdb_set_relfunc mdbx_set_relfunc -#define mdb_set_relctx mdbx_set_relctx -#define mdb_get mdbx_get -#define mdb_put mdbx_put -#define mdb_del mdbx_del -#define mdb_cursor_open mdbx_cursor_open -#define mdb_cursor_close mdbx_cursor_close -#define mdb_cursor_renew mdbx_cursor_renew -#define mdb_cursor_txn mdbx_cursor_txn -#define mdb_cursor_dbi mdbx_cursor_dbi -#define mdb_cursor_get mdbx_cursor_get -#define mdb_cursor_put mdbx_cursor_put -#define mdb_cursor_del mdbx_cursor_del -#define mdb_cursor_count mdbx_cursor_count -#define mdb_cmp mdbx_cmp -#define mdb_dcmp mdbx_dcmp -#define mdb_reader_list mdbx_reader_list -#define mdb_reader_check mdbx_reader_check -#define mdb_dkey mdbx_dkey - -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdbx_open(txn,name,flags,dbi) mdbx_dbi_open(txn,name,flags,dbi) -#define mdbx_close(env,dbi) mdbx_dbi_close(env,dbi) - -#include "./lmdb.h" +#include +#include +#include +#include +#include #ifdef __cplusplus extern "C" { #endif -int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive); +/** Library major version */ +#define MDB_VERSION_MAJOR 0 +/** Library minor version */ +#define MDB_VERSION_MINOR 9 +/** Library patch version */ +#define MDB_VERSION_PATCH 19 + +/** Combine args a,b,c into a single integer for easy version comparisons */ +#define MDB_VERINT(a, b, c) (((a) << 24) | ((b) << 16) | (c)) + +/** The full library version as a single integer */ +#define MDB_VERSION_FULL \ + MDB_VERINT(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH) + +/** The release date of this library version */ +#define MDB_VERSION_DATE "DEVEL" + +/** A stringifier for the version info */ +#define MDB_VERSTR(a, b, c, d) \ + "MDBX " #a "." #b "." 
#c ": (" d ", https://github.com/ReOpen/libmdbx)" + +/** A helper for the stringifier macro */ +#define MDB_VERFOO(a, b, c, d) MDB_VERSTR(a, b, c, d) + +/** The full library version as a C string */ +#define MDB_VERSION_STRING \ + MDB_VERFOO(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH, \ + MDB_VERSION_DATE) +/** @} */ + +/** @brief Opaque structure for a database environment. + * + * A DB environment supports multiple databases, all residing in the same + * shared-memory map. + */ +typedef struct MDB_env MDB_env; + +/** @brief Opaque structure for a transaction handle. + * + * All database operations require a transaction handle. Transactions may be + * read-only or read-write. + */ +typedef struct MDB_txn MDB_txn; + +/** @brief A handle for an individual database in the DB environment. */ +typedef unsigned MDB_dbi; + +/** @brief Opaque structure for navigating through a database */ +typedef struct MDB_cursor MDB_cursor; + +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Values returned from the database are valid only until a subsequent + * update operation, or the end of the transaction. Do not modify or + * free them, they commonly point into the database itself. + * + * Key sizes must be between 1 and #mdbx_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + */ +typedef struct iovec MDB_val; +#define mv_size iov_len +#define mv_data iov_base + +/** @brief A callback function used to compare two keys in a database */ +typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to relocate a position-dependent data item + * in a fixed-address database. + * + * The \b newptr gives the item's desired address in + * the memory map, and \b oldptr gives its previous address. The item's actual + * data resides at the address in \b item. 
This callback is expected to walk + * through the fields of the record in \b item and modify any + * values based at the \b oldptr address to be relative to the \b newptr + * address. + * @param[in,out] item The item that is to be relocated. + * @param[in] oldptr The previous address. + * @param[in] newptr The new address to relocate to. + * @param[in] relctx An application-provided context, set by + * #mdbx_set_relctx(). + * @todo This feature is currently unimplemented. + */ +typedef void(MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, + void *relctx); + +/** @defgroup mdbx_env Environment Flags + * @{ + */ +/** mmap at a fixed address (experimental) */ +#define MDB_FIXEDMAP 0x01 +/** no environment directory */ +#define MDB_NOSUBDIR 0x4000 +/** don't fsync after commit */ +#define MDB_NOSYNC 0x10000 +/** read only */ +#define MDB_RDONLY 0x20000 +/** don't fsync metapage after commit */ +#define MDB_NOMETASYNC 0x40000 +/** use writable mmap */ +#define MDB_WRITEMAP 0x80000 +/** use asynchronous msync when #MDB_WRITEMAP is used */ +#define MDB_MAPASYNC 0x100000 +/** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 +/** don't do any locking, caller must manage their own locks + * WARNING: libmdbx don't support this mode. 
*/ +#define MDB_NOLOCK__UNSUPPORTED 0x400000 +/** don't do readahead */ +#define MDB_NORDAHEAD 0x800000 +/** don't initialize malloc'd memory before writing to datafile */ +#define MDB_NOMEMINIT 0x1000000 + +#if MDBX_MODE_ENABLED +/** aim to coalesce FreeDB records */ +#define MDBX_COALESCE 0x2000000 +/** LIFO policy for reclaiming FreeDB records */ +#define MDBX_LIFORECLAIM 0x4000000 +#endif /* MDBX_MODE_ENABLED */ + +/** make a steady-sync only on close and explicit env-sync */ +#define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC | MDB_MAPASYNC) +/** debuging option, fill/perturb released pages */ +#define MDBX_PAGEPERTURB 0x8000000 +/** @} */ + +/** @defgroup mdbx_dbi_open Database Flags + * @{ + */ +/** use reverse string keys */ +#define MDB_REVERSEKEY 0x02 +/** use sorted duplicates */ +#define MDB_DUPSORT 0x04 +/** numeric keys in native byte order, either unsigned int or #mdbx_size_t. + * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) + * The keys must all be of the same size. */ +#define MDB_INTEGERKEY 0x08 +/** with #MDB_DUPSORT, sorted dup items have fixed size */ +#define MDB_DUPFIXED 0x10 +/** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ +#define MDB_INTEGERDUP 0x20 +/** with #MDB_DUPSORT, use reverse string dups */ +#define MDB_REVERSEDUP 0x40 +/** create DB if not already existing */ +#define MDB_CREATE 0x40000 +/** @} */ + +/** @defgroup mdbx_put Write Flags + * @{ + */ +/** For put: Don't write if the key already exists. */ +#define MDB_NOOVERWRITE 0x10 +/** Only for #MDB_DUPSORT
+ * For put: don't write if the key and data pair already exist.
+ * For mdbx_cursor_del: remove all duplicate data items. + */ +#define MDB_NODUPDATA 0x20 +/** For mdbx_cursor_put: overwrite the current key/data pair + * MDBX allows this flag for mdbx_put() for explicit overwrite/update without + * insertion. */ +#define MDB_CURRENT 0x40 +/** For put: Just reserve space for data, don't copy it. Return a + * pointer to the reserved space. + */ +#define MDB_RESERVE 0x10000 +/** Data is being appended, don't split full pages. */ +#define MDB_APPEND 0x20000 +/** Duplicate data is being appended, don't split full pages. */ +#define MDB_APPENDDUP 0x40000 +/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ +#define MDB_MULTIPLE 0x80000 +/* @} */ + +/** @defgroup mdbx_copy Copy Flags + * @{ + */ +/** Compacting copy: Omit free space from copy, and renumber all + * pages sequentially. + */ +#define MDB_CP_COMPACT 0x01 +/* @} */ + +/** @brief Cursor Get operations. + * + * This is the set of all operations for retrieving data + * using a cursor. + */ +typedef enum MDB_cursor_op { + MDB_FIRST, /**< Position at first key/data item */ + MDB_FIRST_DUP, /**< Position at first data item of current key. + Only for #MDB_DUPSORT */ + MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ + MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for + #MDB_DUPSORT */ + MDB_GET_CURRENT, /**< Return key/data at current cursor position */ + MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items + from current cursor position. Move + cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for + #MDB_DUPFIXED */ + MDB_LAST, /**< Position at last key/data item */ + MDB_LAST_DUP, /**< Position at last data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT, /**< Position at next data item */ + MDB_NEXT_DUP, /**< Position at next data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items + from next cursor position. 
Move + cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for + #MDB_DUPFIXED */ + MDB_NEXT_NODUP, /**< Position at first data item of next key */ + MDB_PREV, /**< Position at previous data item */ + MDB_PREV_DUP, /**< Position at previous data item of current key. + Only for #MDB_DUPSORT */ + MDB_PREV_NODUP, /**< Position at last data item of previous key */ + MDB_SET, /**< Position at specified key */ + MDB_SET_KEY, /**< Position at specified key, return key + data */ + MDB_SET_RANGE, /**< Position at first key greater than or equal to specified + key. */ + MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to + a page of duplicate data items. + Only for #MDB_DUPFIXED */ +} MDB_cursor_op; + +/** @defgroup errors Return Codes + * + * BerkeleyDB uses -30800 to -30999, we'll go under them + * @{ + */ +/** Successful result */ +#define MDB_SUCCESS 0 +/** key/data pair already exists */ +#define MDB_KEYEXIST (-30799) +/** key/data pair not found (EOF) */ +#define MDB_NOTFOUND (-30798) +/** Requested page not found - this usually indicates corruption */ +#define MDB_PAGE_NOTFOUND (-30797) +/** Located page was wrong type */ +#define MDB_CORRUPTED (-30796) +/** Update of meta page failed or environment had fatal error */ +#define MDB_PANIC (-30795) +/** Environment version mismatch */ +#define MDB_VERSION_MISMATCH (-30794) +/** File is not a valid LMDB file */ +#define MDB_INVALID (-30793) +/** Environment mapsize reached */ +#define MDB_MAP_FULL (-30792) +/** Environment maxdbs reached */ +#define MDB_DBS_FULL (-30791) +/** Environment maxreaders reached */ +#define MDB_READERS_FULL (-30790) +/** Txn has too many dirty pages */ +#define MDB_TXN_FULL (-30788) +/** Cursor stack too deep - internal error */ +#define MDB_CURSOR_FULL (-30787) +/** Page has not enough space - internal error */ +#define MDB_PAGE_FULL (-30786) +/** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) +/** Operation and DB incompatible, or DB 
type changed. This can mean: + *
    + *
  • The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. + *
  • Opening a named DB when the unnamed DB has #MDB_DUPSORT / + *#MDB_INTEGERKEY. + *
  • Accessing a data record as a database, or vice versa. + *
  • The database was dropped and recreated with different flags. + *
+ */ +#define MDB_INCOMPATIBLE (-30784) +/** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) +/** Transaction must abort, has a child, or is invalid */ +#define MDB_BAD_TXN (-30782) +/** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ +#define MDB_BAD_VALSIZE (-30781) +/** The specified DBI was changed unexpectedly */ +#define MDB_BAD_DBI (-30780) +/** Unexpected problem - txn should abort */ +#define MDB_PROBLEM (-30779) +/** The last defined error code */ +#define MDB_LAST_ERRCODE MDB_PROBLEM +/** @} */ + +/** @brief Statistics for a database in the environment */ +typedef struct MDBX_stat { + unsigned ms_psize; /**< Size of a database page. + This is currently the + same for all databases. */ + unsigned ms_depth; /**< Depth (height) of the B-tree */ + size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + size_t ms_leaf_pages; /**< Number of leaf pages */ + size_t ms_overflow_pages; /**< Number of overflow pages */ + size_t ms_entries; /**< Number of data items */ +} MDBX_stat; + +/** @brief Information about the environment */ +typedef struct MDBX_envinfo { + void *me_mapaddr; /**< Address of map, if fixed */ + size_t me_mapsize; /**< Size of the data memory map */ + size_t me_last_pgno; /**< ID of the last used page */ + size_t me_last_txnid; /**< ID of the last committed transaction */ + unsigned me_maxreaders; /**< max reader slots in the environment */ + unsigned me_numreaders; /**< max reader slots used in the environment */ + size_t me_tail_txnid; /**< ID of the last reader transaction */ + size_t me_meta1_txnid, me_meta1_sign; + size_t me_meta2_txnid, me_meta2_sign; +} MDBX_envinfo; + +/** @brief Return the LMDB library version information. 
+ * + * @param[out] major if non-NULL, the library major version number is copied + * here + * @param[out] minor if non-NULL, the library minor version number is copied + * here + * @param[out] patch if non-NULL, the library patch version number is copied + * here + * @retval "version string" The library version as a string + */ +char *mdbx_version(int *major, int *minor, int *patch); + +/** @brief Return a string describing a given error code. + * + * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) + * function. If the error code is greater than or equal to 0, then the string + * returned by the system function strerror(3) is returned. If the error code + * is less than 0, an error string corresponding to the LMDB library error is + * returned. See @ref errors for a list of LMDB-specific error codes. + * @param[in] err The error code + * @retval "error message" The description of the error + */ +char *mdbx_strerror(int err); + +/** @brief Create an LMDB environment handle. + * + * This function allocates memory for a #MDB_env structure. To release + * the allocated memory and discard the handle, call #mdbx_env_close(). + * Before the handle may be used, it must be opened using #mdbx_env_open(). + * Various other options may also need to be set before opening the handle, + * e.g. #mdbx_env_set_mapsize(), #mdbx_env_set_maxreaders(), + * #mdbx_env_set_maxdbs(), + * depending on usage requirements. + * @param[out] env The address where the new handle will be stored + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_create(MDB_env **env); + +/** @brief Open an environment handle. + * + * If this function fails, #mdbx_env_close() must be called to discard the + *#MDB_env handle. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] path The directory in which the database files reside. This + * directory must already exist and be writable. 
+ * @param[in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * Flags set by mdbx_env_set_flags() are also used. + *
    + *
  • #MDB_FIXEDMAP + * use a fixed address for the mmap region. This flag must be specified + * when creating the environment, and is stored persistently in the + *environment. + * If successful, the memory map will always reside at the same + *virtual address + * and pointers used to reference data items in the database will + *be constant + * across multiple invocations. This option may not always work, + *depending on + * how the operating system has allocated memory to shared + *libraries and other uses. + * The feature is highly experimental. + *
  • #MDB_NOSUBDIR + * By default, LMDB creates its environment in a directory whose + * pathname is given in \b path, and creates its data and lock + *files + * under that directory. With this option, \b path is used as-is + *for + * the database main data file. The database lock file is the \b + *path + * with "-lock" appended. + *
  • #MDB_RDONLY + * Open the environment in read-only mode. No write operations will + *be + * allowed. LMDB will still modify the lock file - except on + *read-only + * filesystems, where LMDB does not use locks. + *
  • #MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This uses + * fewer mallocs but loses protection from application bugs + * like wild pointer writes and other bad updates into the + *database. + * This may be slightly faster for DBs that fit entirely in RAM, + *but + * is slower for DBs larger than RAM. + * Incompatible with nested transactions. + * Do not mix processes with and without MDB_WRITEMAP on the same + * environment. This can defeat durability (#mdbx_env_sync etc). + *
  • #MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit + *the + * metadata flush. Defer that until the system flushes files to + *disk, + * or next non-MDB_RDONLY commit or #mdbx_env_sync(). This + *optimization + * maintains database integrity, but a system crash may undo the + *last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database + *property. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDB_NOSYNC + * Don't flush system buffers to disk when committing a + *transaction. + * This optimization means a system crash can corrupt the database + *or + * lose the last transactions if buffers are not yet flushed to + *disk. + * The risk is governed by how often the system flushes dirty + *buffers + * to disk and how often #mdbx_env_sync() is called. However, if + *the + * filesystem preserves write order and the #MDB_WRITEMAP flag is + *not + * used, transactions exhibit ACI (atomicity, consistency, + *isolation) + * properties and only lose D (durability). I.e. database + *integrity + * is maintained, but a system crash may undo the final + *transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with + *no + * hint for when to write transactions to disk, unless + *#mdbx_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling + *#mdbx_env_sync() + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdbx_txn_reset() + *keeps + * the slot reserved for the #MDB_txn object. A thread may use + *parallel + * read-only transactions. A read-only transaction may span threads + *if + * the user synchronizes its use. Applications that multiplex + *many + * user threads over individual OS threads need this option. Such + *an + * application must also serialize the write transactions in an + *OS + * thread, since LMDB's write locking is unaware of the user + *threads. + *
  • #MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper + *operation + * the caller must enforce single-writer semantics, and must + *ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so + *that + * no readers may be active at all when a writer begins. + * WARNING: libmdbx does not support this mode; the flag is only + * defined as #MDB_NOLOCK__UNSUPPORTED. + *
  • #MDB_NORDAHEAD + * Turn off readahead. Most operating systems perform readahead + *on + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + *
  • #MDB_NOMEMINIT + * Don't initialize malloc'd memory before writing to unused + *spaces + * in the data file. By default, memory for pages written to the + *data + * file is obtained using malloc. While these pages may be reused + *in + * subsequent transactions, freshly malloc'd pages will be + *initialized + * to zeroes before use. This avoids persisting leftover data from + *other + * code (that used the heap and subsequently freed the memory) into + *the + * data file. Note that many other system libraries may allocate + * and free memory from the heap for arbitrary uses. E.g., stdio + *may + * use the heap for file I/O buffers. This initialization step has + *a + * modest performance cost so some applications may want to + *disable + * it using this flag. This option can be a problem for + *applications + * which handle sensitive data like passwords, and it makes + *memory + * checkers like Valgrind noisy. This flag is not needed with + *#MDB_WRITEMAP, + * which writes directly to the mmap instead of using malloc for + *pages. The + * initialization is also skipped if #MDB_RESERVE is used; the + * caller is expected to overwrite all of the memory that was + * reserved in that case. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDBX_COALESCE + * Aim to coalesce records while reclaiming FreeDB. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDBX_LIFORECLAIM + * LIFO policy for reclaiming FreeDB records. This significantly + *reduces + * write IOPS in case of MDB_NOSYNC with periodic checkpoints. + *
+ * @param[in] mode The UNIX permissions to set on created files and + *semaphores. + * This parameter is ignored on Windows. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't + *match the + * version that created the database environment. + *
  • #MDB_INVALID - the environment file headers are corrupted. + *
  • ENOENT - the directory specified by the path parameter doesn't + *exist. + *
  • EACCES - the user didn't have permission to access the environment + *files. + *
  • EAGAIN - the environment was locked by another process. + *
+ */ +int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); +int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, + mode_t mode, int *exclusive); + +/** @brief Copy an LMDB environment to the specified path. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copy(MDB_env *env, const char *path); + +/** @brief Copy an LMDB environment to the specified file descriptor. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copyfd(MDB_env *env, int fd); + +/** @brief Copy an LMDB environment to the specified path, with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. 
+ * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free + * pages and sequentially renumber all pages in output. This + *option + * consumes more CPU and runs more slowly than the default. + * Currently it fails if the environment has suffered a page + *leak. + *
+ * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); + +/** @brief Copy an LMDB environment to the specified file descriptor, + * with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. See + * #mdbx_env_copy2() for further details. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @param[in] flags Special options for this operation. + * See #mdbx_env_copy2() for options. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copyfd2(MDB_env *env, int fd, unsigned flags); + +/** @brief Return statistics about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + */ int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); -int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); + +/** @brief Return information about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] stat The address of an #MDB_envinfo structure + * where the information will be copied + */ int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); + +/** @brief Flush the data buffers to disk. + * + * Data is always written to disk when #mdbx_txn_commit() is called, + * but the operating system may keep it buffered. 
LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is + * not valid if the environment was opened with #MDB_RDONLY. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] force If non-zero, force a synchronous flush. Otherwise + * if the environment has the #MDB_NOSYNC flag set the flushes + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - the environment is read-only. + *
  • EINVAL - an invalid parameter was specified. + *
  • EIO - an error occurred during synchronization. + *
+ */ +int mdbx_env_sync(MDB_env *env, int force); + +/** @brief Close the environment and release the memory map. + * + * Only a single thread may call this function. All transactions, databases, + * and cursors must already be closed before calling this function. Any + * attempt to + * use such handles after calling this function will cause a SIGSEGV. + * The environment handle will be freed and must not be used again after this + * call. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] dont_sync A don't-sync flag; if non-zero, the last checkpoint + * (meta-page update) will be kept "as is" and may still be "weak" + * in NOSYNC/MAPASYNC modes. Such a "weak" checkpoint will be ignored + * on the next open, and transactions since the last non-weak + * checkpoint (meta-page update) will be rolled back to guarantee consistency. + */ +void mdbx_env_close(MDB_env *env); + +/** @brief Set environment flags. + * + * This may be used to set some flags in addition to those from + * #mdbx_env_open(), or to unset these flags. If several threads + * change the flags at the same time, the result is undefined. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] flags The flags to change, bitwise OR'ed together + * @param[in] onoff A non-zero value sets the flags, zero clears them. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); + +/** @brief Get environment flags. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] flags The address of an integer to store the flags + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_flags(MDB_env *env, unsigned *flags); + +/** @brief Return the path that was used in #mdbx_env_open(). + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] path Address of a string pointer to contain the path. This + * is the actual string in the environment, not a copy. It should not be + * altered in any way. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_path(MDB_env *env, const char **path); + +/** @brief Return the filedescriptor for the given environment. + * + * This function may be called after fork(), so the descriptor can be + * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. + * (Until LMDB 0.9.18, only the lockfile had that.) + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] fd Address of a int to contain the descriptor. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_fd(MDB_env *env, int *fd); + +/** @brief Set the size of the memory map to use for this environment. + * + * The size should be a multiple of the OS page size. The default is + * 10485760 bytes. The size of the memory map is also the maximum size + * of the database. The value should be chosen as large as possible, + * to accommodate future growth of the database. + * This function should be called after #mdbx_env_create() and before + *#mdbx_env_open(). + * It may be called at later times if no transactions are active in + * this process. Note that the library does not check for this condition, + * the caller must ensure it explicitly. + * + * The new size takes effect immediately for the current process but + * will not be persisted to any others until a write transaction has been + * committed by the current process. Also, only mapsize increases are + * persisted into the environment. + * + * If the mapsize is increased by another process, and data has grown + * beyond the range of the current mapsize, #mdbx_txn_begin() will + * return #MDB_MAP_RESIZED. This function may be called with a size + * of zero to adopt the new size. + * + * Any attempt to set a size smaller than the space already consumed + * by the environment will be silently changed to the current size of the used + *space. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] size The size in bytes + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment + *has + * an active write transaction. + *
+ */ +int mdbx_env_set_mapsize(MDB_env *env, size_t size); + +/** @brief Set the maximum number of threads/reader slots for the environment. + * + * This defines the number of slots in the lock table that is used to track + *readers in the + * the environment. The default is 126. + * Starting a read-only transaction normally ties a lock table slot to the + * current thread until the environment closes or the thread exits. If + * MDB_NOTLS is in use, #mdbx_txn_begin() instead ties the slot to the + * MDB_txn object until it or the #MDB_env object is destroyed. + * This function may only be called after #mdbx_env_create() and before + *#mdbx_env_open(). + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] readers The maximum number of reader lock table slots + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is + *already open. + *
+ */ +int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); + +/** @brief Get the maximum number of threads/reader slots for the environment. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] readers Address of an integer to store the number of readers + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); + +/** @brief Set the maximum number of named databases for the environment. + * + * This function is only needed if multiple databases will be used in the + * environment. Simpler applications that use the environment as a single + * unnamed database can ignore this option. + * This function may only be called after #mdbx_env_create() and before + *#mdbx_env_open(). + * + * Currently a moderate number of slots are cheap but a huge number gets + * expensive: 7-120 words per transaction, and every #mdbx_dbi_open() + * does a linear search of the opened slots. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] dbs The maximum number of databases + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is + *already open. + *
+ */ +int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); + +/** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. + * + * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. + * See @ref MDB_val. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @return The maximum size of a key we can write + */ +int mdbx_env_get_maxkeysize(MDB_env *env); + +/** @brief Set application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_set_userctx(MDB_env *env, void *ctx); + +/** @brief Get the application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @return The pointer set by #mdbx_env_set_userctx(). + */ +void *mdbx_env_get_userctx(MDB_env *env); + +/** @brief A callback function for most LMDB assert() failures, + * called before printing the message and aborting. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] msg The assertion message, not including newline. + */ +typedef void MDB_assert_func(MDB_env *env, const char *msg, + const char *function, unsigned line); + +/** Set or reset the assert() callback of the environment. + * Disabled if liblmdb is buillt with MDB_DEBUG=0. + * @note This hack should become obsolete as lmdb's error handling matures. + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] func An #MDB_assert_func function, or 0. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); + +/** @brief Create a transaction for use with the environment. + * + * The transaction handle may be discarded using #mdbx_txn_abort() or + *#mdbx_txn_commit(). 
+ * @note A transaction and its cursors must only be used by a single + * thread, and a thread may only have a single transaction at a time. + * If #MDB_NOTLS is in use, this does not apply to read-only transactions. + * @note Cursors may not span transactions. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] parent If this parameter is non-NULL, the new transaction + * will be a nested transaction, with the transaction indicated by \b parent + * as its parent. Transactions may be nested to any level. A parent + * transaction and its cursors may not issue any other operations than + * mdbx_txn_commit and mdbx_txn_abort while it has active child transactions. + * @param[in] flags Special options for this transaction. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_RDONLY + * This transaction will not perform any write operations. + *
+ * @param[out] txn Address where the new #MDB_txn handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • #MDB_MAP_RESIZED - another process wrote data beyond this + *MDB_env's + * mapsize and this environment's map must be resized as well. + * See #mdbx_env_set_mapsize(). + *
  • #MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See #mdbx_env_set_maxreaders(). + *
  • ENOMEM - out of memory. + *
+ */ +int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, + MDB_txn **txn); + +/** @brief Returns the transaction's #MDB_env + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + */ +MDB_env *mdbx_txn_env(MDB_txn *txn); + +/** @brief Return the transaction's ID. + * + * This returns the identifier associated with this transaction. For a + * read-only transaction, this corresponds to the snapshot being read; + * concurrent readers will frequently have the same transaction ID. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @return A transaction ID, valid if input is an active transaction. + */ +size_t mdbx_txn_id(MDB_txn *txn); + +/** @brief Commit all the operations of a transaction into the database. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdbx_cursor_renew(). + * + * @note MDBX-mode: + * A cursor must be closed explicitly always, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * + * @note LMDB-compatible mode: + * Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
  • ENOSPC - no more disk space. + *
  • EIO - a low-level I/O error occurred while writing. + *
  • ENOMEM - out of memory. + *
+ */ +int mdbx_txn_commit(MDB_txn *txn); + +/** @brief Abandon all the operations of the transaction instead of saving + * them. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdbx_cursor_renew(). + * + * @note MDBX-mode: + * A cursor must be closed explicitly always, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * + * @note LMDB-compatible mode: + * Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + */ +int mdbx_txn_abort(MDB_txn *txn); + +/** @brief Reset a read-only transaction. + * + * Abort the transaction like #mdbx_txn_abort(), but keep the transaction + * handle. #mdbx_txn_renew() may reuse the handle. This saves allocation + * overhead if the process will start a new read-only transaction soon, + * and also locking overhead if #MDB_NOTLS is in use. The reader table + * lock is released, but the table slot stays tied to its thread or + * #MDB_txn. Use mdbx_txn_abort() to discard a reset handle, and to free + * its lock table slot if MDB_NOTLS is in use. + * Cursors opened within the transaction must not be used + * again after this call, except with #mdbx_cursor_renew(). + * Reader locks generally don't interfere with writers, but they keep old + * versions of database pages allocated. Thus they prevent the old pages + * from being reused when writers commit new data, and so under heavy load + * the database size may grow much more rapidly than otherwise. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + */ +int mdbx_txn_reset(MDB_txn *txn); + +/** @brief Renew a read-only transaction. + * + * This acquires a new reader lock for a transaction handle that had been + * released by #mdbx_txn_reset(). It must be called before a reset transaction + * may be used again. 
+ * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_txn_renew(MDB_txn *txn); + +/** @brief Open a database in the environment. + * A database handle denotes the name and parameters of a database, + * independently of whether such a database exists. + * The database handle may be discarded by calling #mdbx_dbi_close(). + * The old database handle is returned if the database was already open. + * The handle may only be closed once. + * + * The database handle will be private to the current transaction until + * the transaction is successfully committed. If the transaction is + * aborted the handle will be closed automatically. + * After a successful commit the handle will reside in the shared + * environment, and may be used by other transactions. + * + * This function must not be called from multiple concurrent + * transactions in the same process. A transaction that uses + * this function must finish (either commit or abort) before + * any other transaction in the process may use this function. + * + * To use named databases (with name != NULL), #mdbx_env_set_maxdbs() + * must be called before opening the environment. Database names are + * keys in the unnamed database, and may be read but not written. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] name The name of the database to open. If only a single + * database is needed in the environment, this value may be NULL. + * @param[in] flags Special options for this database. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_REVERSEKEY + * Keys are strings to be compared in reverse order, from the end + * of the strings to the beginning. By default, keys are treated as + *strings and + * compared from beginning to end. + *
  • #MDB_DUPSORT + * Duplicate keys may be used in the database. (Or, from another + *perspective, + * keys may have multiple data items, stored in sorted order.) By + *default + * keys must be unique and may have only a single data item. + *
  • #MDB_INTEGERKEY + * Keys are binary integers in native byte order, either unsigned + *int + * or #mdbx_size_t, and will be sorted as such. + * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) + * The keys must all be of the same size. + *
  • #MDB_DUPFIXED + * This flag may only be used in combination with #MDB_DUPSORT. + *This option + * tells the library that the data items for this database are all + *the same + * size, which allows further optimizations in storage and + *retrieval. When + * all data items are the same size, the #MDB_GET_MULTIPLE, + *#MDB_NEXT_MULTIPLE + * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve + *multiple + * items at once. + *
  • #MDB_INTEGERDUP + * This option specifies that duplicate data items are binary + *integers, + * similar to #MDB_INTEGERKEY keys. + *
  • #MDB_REVERSEDUP + * This option specifies that duplicate data items should be + *compared as + * strings in reverse order. + *
  • #MDB_CREATE + * Create the named database if it doesn't exist. This option is + *not + * allowed in a read-only transaction or a read-only environment. + *
+ * @param[out] dbi Address where the new #MDB_dbi handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the specified database doesn't exist in the + *environment + * and #MDB_CREATE was not specified. + *
  • #MDB_DBS_FULL - too many databases have been opened. See + *#mdbx_env_set_maxdbs(). + *
+ */ +int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); + +/** @brief Retrieve statistics for a database. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); + +/** @brief Retrieve the DB flags for a database handle. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[out] flags Address where the flags will be returned. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); + +/** @brief Close a database handle. Normally unnecessary. Use with care: + * + * This call is not mutex protected. Handles should only be closed by + * a single thread, and only if no other threads are going to reference + * the database handle or one of its cursors any further. Do not close + * a handle if an existing transaction has modified its database. + * Doing so can cause misbehavior from database corruption to errors + * like MDB_BAD_VALSIZE (since the DB name is gone). + * + * Closing a database handle is not necessary, but lets #mdbx_dbi_open() + * reuse the handle value. Usually it's better to set a bigger + * #mdbx_env_set_maxdbs(), unless that value would be large. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + */ +void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); + +/** @brief Empty or delete+close a database. + * + * See #mdbx_dbi_close() for restrictions about closing the DB handle. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] del 0 to empty the DB, 1 to delete it from the + * environment and close the DB handle. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); + +/** @brief Set a custom key comparison function for a database. 
+ * + * The comparison function is called whenever it is necessary to compare a + * key specified by the application with a key currently stored in the + *database. + * If no comparison function is specified, and no special key flags were + *specified + * with #mdbx_dbi_open(), the keys are compared lexically, with shorter keys + *collating + * before longer keys. + * @warning This function must be called before any data access functions are + *used, + * otherwise data corruption may occur. The same comparison function must be + *used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + +/** @brief Set a custom data comparison function for a #MDB_DUPSORT database. + * + * This comparison function is called whenever it is necessary to compare a + *data + * item specified by the application with a data item currently stored in the + *database. + * This function only takes effect if the database was opened with the + *#MDB_DUPSORT + * flag. + * If no comparison function is specified, and no special key flags were + *specified + * with #mdbx_dbi_open(), the data items are compared lexically, with shorter + *items collating + * before longer items. + * @warning This function must be called before any data access functions are + *used, + * otherwise data corruption may occur. The same comparison function must be + *used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + +/** @brief Set a relocation function for a #MDB_FIXEDMAP database. + * + * @todo The relocation function is called whenever it is necessary to move + *the data + * of an item to a different position in the database (e.g. through tree + * balancing operations, shifts as a result of adds or deletes, etc.). It is + * intended to allow address/position-dependent data items to be stored in + * a database in an environment opened with the #MDB_FIXEDMAP option. + * Currently the relocation feature is unimplemented and setting + * this function has no effect. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] rel A #MDB_rel_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); + +/** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation + *function. + * + * See #mdbx_set_relfunc and #MDB_rel_func for more details. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * It will be passed to the callback function set by #mdbx_set_relfunc + * as its \b relctx parameter whenever the callback is invoked. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); + +/** @brief Get items from a database. + * + * This function retrieves key/data pairs from the database. The address + * and length of the data associated with the specified \b key are returned + * in the structure to which \b data refers. + * If the database supports duplicate keys (#MDB_DUPSORT) then the + * first data item for the key will be returned. Retrieval of other + * items requires the use of #mdbx_cursor_get(). + * + * @note The memory pointed to by the returned values is owned by the + * database. The caller need not dispose of the memory, and may not + * modify it in any way. For values returned in a read-only transaction + * any modification attempts will cause a SIGSEGV. + * @note Values returned from the database are valid only until a + * subsequent update operation, or the end of the transaction. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] key The key to search for in the database + * @param[out] data The data corresponding to the key + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the key was not in the database. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + +/** @brief Store items into a database. + * + * This function stores key/data pairs in the database. The default behavior + * is to enter the new key/data pair, replacing any previously existing key + * if duplicates are disallowed, or adding a duplicate data item if + * duplicates are allowed (#MDB_DUPSORT). + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] key The key to store in the database + * @param[in,out] data The data to store + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be + *specified + * if the database was opened with #MDB_DUPSORT. The function + *will + * return #MDB_KEYEXIST if the key/data pair already appears in + *the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will + *return + * #MDB_KEYEXIST if the key already appears in the database, even + *if + * the database supports duplicates (#MDB_DUPSORT). The \b data + * parameter will be set to point to the existing item. + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. + * LMDB does nothing else with this memory, the caller is + *expected + * to modify all of the space requested. This flag must not be + * specified if the database was opened with #MDB_DUPSORT. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. This option allows fast bulk loading when keys are + * already known to be in the correct order. Loading unsorted + *keys + * with this flag will cause a #MDB_KEYEXIST error. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdbx_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags); + +/** @brief Delete items from a database. + * + * This function removes key/data pairs from the database. + * + * MDBX-mode: + * The data parameter is NOT ignored regardless the database does + * support sorted duplicate data items or not. If the data parameter + * is non-NULL only the matching data item will be deleted. + * + * LMDB-compatible mode: + * If the database does not support sorted duplicate data items + * (#MDB_DUPSORT) the data parameter is ignored. + * If the database supports sorted duplicates and the data parameter + * is NULL, all of the duplicate data items for the key will be + * deleted. Otherwise, if the data parameter is non-NULL + * only the matching data item will be deleted. + * + * This function will return #MDB_NOTFOUND if the specified key/data + * pair is not in the database. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] key The key to delete from the database + * @param[in] data The data to delete + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + +/** @brief Create a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * A cursor cannot be used when its database handle is closed. Nor + * when its transaction has ended, except with #mdbx_cursor_renew(). + * It can be discarded with #mdbx_cursor_close(). + * + * MDBX-mode: + * A cursor must be closed explicitly always, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * + * LMDB-compatible mode: + * A cursor in a write-transaction can be closed before its transaction + * ends, and will otherwise be closed when its transaction ends. + * A cursor in a read-only transaction must be closed explicitly, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * @note Earlier documentation said that cursors in every transaction + * were closed when the transaction committed or aborted. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[out] cursor Address where the new #MDB_cursor handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); + +/** @brief Close a cursor handle. + * + * The cursor handle will be freed and must not be used again after this call. + * Its transaction must still be live if it is a write-transaction. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + */ +void mdbx_cursor_close(MDB_cursor *cursor); + +/** @brief Renew a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * Cursors that are only used in read-only + * transactions may be re-used, to avoid unnecessary malloc/free overhead. + * The cursor may be associated with a new read-only transaction, and + * referencing the same database handle as it was created with. + * This may be done whether the previous transaction is live or dead. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); + +/** @brief Return the cursor's transaction handle. + * + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + */ +MDB_txn *mdbx_cursor_txn(MDB_cursor *cursor); + +/** @brief Return the cursor's database handle. + * + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + */ +MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); + +/** @brief Retrieve by cursor. + * + * This function retrieves key/data pairs from the database. The address and + *length + * of the key are returned in the object to which \b key refers (except for + *the + * case of the #MDB_SET option, in which the \b key object is unchanged), and + * the address and length of the data are returned in the object to which \b + *data + * refers. + * See #mdbx_get() for restrictions on using the output values. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[in,out] key The key for a retrieved item + * @param[in,out] data The data of a retrieved item + * @param[in] op A cursor operation #MDB_cursor_op + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - no matching key found. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); + +/** @brief Store by cursor. + * + * This function stores key/data pairs into the database. + * The cursor is positioned at the new item, or on failure usually near it. + * @note Earlier documentation incorrectly said errors would leave the + * state of the cursor unchanged. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[in] key The key operated on. + * @param[in] data The data operated on. + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_CURRENT - replace the item at the current cursor position. + * The \b key parameter must still be provided, and must match + *it. + * If using sorted duplicates (#MDB_DUPSORT) the data item must + *still + * sort into the same place. This is intended to be used when the + * new data is the same size as the old. Otherwise it will simply + * perform a delete of the old record followed by an insert. + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be + *specified + * if the database was opened with #MDB_DUPSORT. The function + *will + * return #MDB_KEYEXIST if the key/data pair already appears in + *the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will + *return + * #MDB_KEYEXIST if the key already appears in the database, even + *if + * the database supports duplicates (#MDB_DUPSORT). + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. This + *flag + * must not be specified if the database was opened with + *#MDB_DUPSORT. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a #MDB_KEYEXIST error. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the + *database + * was opened with #MDB_DUPFIXED. The \b data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must + *be + * the size of a single data element. The mv_data of the first + *MDB_val + * must point to the beginning of the array of contiguous data + *elements. + * The mv_size of the second MDB_val must be the count of the + *number + * of data elements to store. On return this field will be set to + * the count of the number of elements actually written. The + *mv_data + * of the second MDB_val is unused. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdbx_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + unsigned flags); + +/** @brief Delete current key/data pair + * + * This function deletes the key/data pair to which the cursor refers. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_NODUPDATA - delete all of the data items for the current key. + * This flag may only be specified if the database was opened with + *#MDB_DUPSORT. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); + +/** @brief Return count of duplicates for current key. + * + * This call is only valid on databases that support sorted duplicate + * data items #MDB_DUPSORT. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[out] countp Address where the count will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - cursor is not initialized, or an invalid parameter was + *specified. + *
+ */ +int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); + +/** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two data items were keys in the + * specified database. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + +/** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two items were data items of + * the specified database. The database must have the #MDB_DUPSORT flag. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to print a message from the library. + * + * @param[in] msg The string to be printed. + * @param[in] ctx An arbitrary context pointer for the callback. + * @return < 0 on failure, >= 0 on success. + */ +typedef int(MDB_msg_func)(const char *msg, void *ctx); + +/** @brief Dump the entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] func A #MDB_msg_func function + * @param[in] ctx Anything the message function needs + * @return < 0 on failure, >= 0 on success. + */ +int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); + +/** @brief Check for stale entries in the reader lock table. 
+ * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] dead Number of stale slots that were cleared + * @return 0 on success, non-zero on failure. + */ +int mdbx_reader_check(MDB_env *env, int *dead); + +char *mdbx_dkey(MDB_val *key, char *buf); + int mdbx_env_close_ex(MDB_env *env, int dont_sync); - /** @brief Set threshold to force flush the data buffers to disk, - * even of #MDB_NOSYNC, #MDB_NOMETASYNC and #MDB_MAPASYNC flags - * in the environment. - * - * Data is always written to disk when #mdb_txn_commit() is called, - * but the operating system may keep it buffered. LMDB always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. - * - * The default is 0, than mean no any threshold checked, - * and no additional flush will be made. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] bytes The size in bytes of summary changes - * when a synchronous flush would be made. - * @return A non-zero error value on failure and 0 on success. - */ +/** @brief Set threshold to force flush the data buffers to disk, + * even of #MDB_NOSYNC, #MDB_NOMETASYNC and #MDB_MAPASYNC flags + * in the environment. + * + * Data is always written to disk when #mdbx_txn_commit() is called, + * but the operating system may keep it buffered. LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. + * + * The default is 0, than mean no any threshold checked, + * and no additional flush will be made. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] bytes The size in bytes of summary changes + * when a synchronous flush would be made. + * @return A non-zero error value on failure and 0 on success. + */ int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); - /** @brief Returns a lag of the reading. 
- * - * Returns an information for estimate how much given read-only - * transaction is lagging relative the to actual head. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[out] percent Percentage of page allocation in the database. - * @return Number of transactions committed after the given was started for read, or -1 on failure. - */ +/** @brief Returns a lag of the reading. + * + * Returns information for estimating how far the given read-only + * transaction is lagging behind the actual head. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[out] percent Percentage of page allocation in the database. + * @return Number of transactions committed after the given one was started + * for read, or -1 on failure. + */ int mdbx_txn_straggler(MDB_txn *txn, int *percent); - /** @brief A callback function for killing a laggard readers, - * but also could waiting ones. Called in case of MDB_MAP_FULL error. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] pid pid of the reader process. - * @param[in] thread_id thread_id of the reader thread. - * @param[in] txn Transaction number on which stalled. - * @param[in] gap a lag from the last commited txn. - * @param[in] retry a retry number, less that zero for notify end of OOM-loop. - * @return -1 on failure (reader is not killed), - * 0 on a race condition (no such reader), - * 1 on success (reader was killed), - * >1 on success (reader was SURE killed). - */ -typedef int (MDBX_oom_func)(MDB_env *env, int pid, void* thread_id, size_t txn, unsigned gap, int retry); +/** @brief A callback function for killing laggard readers, + * but it could also wait for them. Called in case of MDB_MAP_FULL error. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] pid pid of the reader process. + * @param[in] thread_id thread_id of the reader thread. 
+ * @param[in] txn Transaction number on which the reader is stalled. + * @param[in] gap a lag from the last committed txn. + * @param[in] retry a retry number, less than zero to notify the end of the + * OOM-loop. + * @return -1 on failure (reader is not killed), + * 0 on a race condition (no such reader), + * 1 on success (reader was killed), + * >1 on success (reader was SURE killed). + */ +typedef int(MDBX_oom_func)(MDB_env *env, int pid, void *thread_id, size_t txn, + unsigned gap, int retry); - /** @brief Set the OOM callback. - * - * Callback will be called only on out-of-pages case for killing - * a laggard readers to allowing reclaiming of freeDB. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] oomfunc A #MDBX_oom_func function or NULL to disable. - */ +/** @brief Set the OOM callback. + * + * The callback is called only in the out-of-pages case, for killing + * laggard readers to allow reclaiming of freeDB. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] oom_func A #MDBX_oom_func function or NULL to disable. + */ void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); - /** @brief Get the current oom_func callback. - * - * Callback will be called only on out-of-pages case for killing - * a laggard readers to allowing reclaiming of freeDB. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @return A #MDBX_oom_func function or NULL if disabled. - */ -MDBX_oom_func* mdbx_env_get_oomfunc(MDB_env *env); +/** @brief Get the current oom_func callback. + * + * The callback is called only in the out-of-pages case, for killing + * laggard readers to allow reclaiming of freeDB. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @return A #MDBX_oom_func function or NULL if disabled. 
+ */ +MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); -#define MDBX_DBG_ASSERT 1 -#define MDBX_DBG_PRINT 2 -#define MDBX_DBG_TRACE 4 -#define MDBX_DBG_EXTRA 8 -#define MDBX_DBG_AUDIT 16 -#define MDBX_DBG_EDGE 32 +#define MDBX_DBG_ASSERT 1 +#define MDBX_DBG_PRINT 2 +#define MDBX_DBG_TRACE 4 +#define MDBX_DBG_EXTRA 8 +#define MDBX_DBG_AUDIT 16 +#define MDBX_DBG_EDGE 32 /* LY: a "don't touch" value */ -#define MDBX_DBG_DNT (-1L) +#define MDBX_DBG_DNT (-1L) typedef void MDBX_debug_func(int type, const char *function, int line, - const char *msg, va_list args); + const char *msg, va_list args); -int mdbx_setup_debug(int flags, MDBX_debug_func* logger, long edge_txn); +int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); -typedef int MDBX_pgvisitor_func(size_t pgno, unsigned pgnumber, void* ctx, - const char* dbi, const char *type, int nentries, - int payload_bytes, int header_bytes, int unused_bytes); -int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func* visitor, void* ctx); +typedef int MDBX_pgvisitor_func(size_t pgno, unsigned pgnumber, void *ctx, + const char *dbi, const char *type, int nentries, + int payload_bytes, int header_bytes, + int unused_bytes); +int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); -typedef struct mdbx_canary { - size_t x, y, z, v; -} mdbx_canary; +typedef struct mdbx_canary { size_t x, y, z, v; } mdbx_canary; -int mdbx_canary_put(MDB_txn *txn, const mdbx_canary* canary); -size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary* canary); +int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); +size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); /* Returns: * - MDBX_RESULT_TRUE when no more data available @@ -236,20 +1768,19 @@ int mdbx_cursor_on_last(MDB_cursor *mc); #define MDBX_RESULT_FALSE MDB_SUCCESS #define MDBX_RESULT_TRUE (-1) -int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *new_data, MDB_val *old_data, unsigned flags); +int mdbx_replace(MDB_txn *txn, MDB_dbi 
dbi, MDB_val *key, MDB_val *new_data, + MDB_val *old_data, unsigned flags); /* Same as mdbx_get(), but: * 1) if values_count is not NULL, then returns the count * of multi-values/duplicates for a given key. * 2) updates the key for pointing to the actual key's data inside DB. */ -int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, int* values_count); +int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + int *values_count); -int mdbx_is_dirty(const MDB_txn *txn, const void* ptr); +int mdbx_is_dirty(const MDB_txn *txn, const void *ptr); int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); - -/** @} */ + MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); #ifdef __cplusplus } diff --git a/mdbx_chk.c b/mdbx_chk.c new file mode 100644 index 00000000..6c1f6454 --- /dev/null +++ b/mdbx_chk.c @@ -0,0 +1,979 @@ +/* mdbx_chk.c - memory-mapped database check tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2015,2016 Peter-Service R&D LLC. + * + * This file is part of libmdbx. + * + * libmdbx is free software; you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * libmdbx is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdbx.h" +#include "midl.h" + +typedef struct flagbit { + int bit; + char *name; +} flagbit; + +flagbit dbflags[] = {{MDB_DUPSORT, "dupsort"}, + {MDB_INTEGERKEY, "integerkey"}, + {MDB_REVERSEKEY, "reversekey"}, + {MDB_DUPFIXED, "dupfixed"}, + {MDB_REVERSEDUP, "reversedup"}, + {MDB_INTEGERDUP, "integerdup"}, + {0, NULL}}; + +static volatile sig_atomic_t gotsignal; + +static void signal_handler(int sig) { + (void)sig; + gotsignal = 1; +} + +#define MAX_DBI 32768 + +#define EXIT_INTERRUPTED (EXIT_FAILURE + 4) +#define EXIT_FAILURE_SYS (EXIT_FAILURE + 3) +#define EXIT_FAILURE_MDB (EXIT_FAILURE + 2) +#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1) +#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE + +struct { + const char *dbi_names[MAX_DBI]; + size_t dbi_pages[MAX_DBI]; + size_t dbi_empty_pages[MAX_DBI]; + size_t dbi_payload_bytes[MAX_DBI]; + size_t dbi_lost_bytes[MAX_DBI]; + short *pagemap; + size_t total_payload_bytes; + size_t pgcount; +} walk; + +static __attribute__((constructor)) void init_walk(void) { + walk.dbi_names[0] = "@gc"; +} + +size_t total_unused_bytes; +int exclusive = 2; + +MDB_env *env; +MDB_txn *txn, *locktxn; +MDBX_envinfo info; +MDBX_stat stat; +size_t maxkeysize, reclaimable_pages, freedb_pages, lastpgno; +size_t userdb_count, skipped_subdb; +unsigned verbose, quiet; +const char *only_subdb; + +struct problem { + struct problem *pr_next; + size_t count; + const char *caption; +}; + +struct problem *problems_list; +size_t total_problems; + +static void __attribute__((format(printf, 1, 2))) print(const char *msg, ...) { + if (!quiet) { + va_list args; + + fflush(stderr); + va_start(args, msg); + vfprintf(stdout, msg, args); + va_end(args); + } +} + +static void __attribute__((format(printf, 1, 2))) error(const char *msg, ...) 
{
+  total_problems++;
+
+  if (!quiet) {
+    va_list args;
+
+    fflush(stdout);
+    va_start(args, msg);
+    vfprintf(stderr, msg, args);
+    va_end(args);
+    fflush(NULL);
+  }
+}
+
+/* Free every name registered by pagemap_lookup_dbi() and the page map.
+ * Slot 0 is skipped: init_walk() points it at a string literal. */
+static void pagemap_cleanup(void) {
+  int i;
+
+  for (i = 1; i < MAX_DBI; ++i) {
+    if (walk.dbi_names[i]) {
+      free((void *)walk.dbi_names[i]);
+      walk.dbi_names[i] = NULL;
+    }
+  }
+
+  free(walk.pagemap);
+  walk.pagemap = NULL;
+}
+
+/* Map a database name to its slot in walk.dbi_names, registering a copy of
+ * the name in the first free slot when it has not been seen yet.
+ *
+ * Returns the slot index (>= 1), or -1 when the table is full or the name
+ * cannot be duplicated; pgvisitor() turns -1 into ENOMEM. */
+static int pagemap_lookup_dbi(const char *dbi) {
+  static int last;
+  char *copy;
+  int i;
+
+  /* Fast path: re-check the slot returned by the previous call. */
+  if (last > 0 && strcmp(walk.dbi_names[last], dbi) == 0)
+    return last;
+
+  /* Bound the scan by the loop index and test it before reading the slot,
+   * so a full table never reads past the end of walk.dbi_names. */
+  for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i)
+    if (strcmp(walk.dbi_names[i], dbi) == 0)
+      return last = i;
+
+  if (i == MAX_DBI)
+    return -1;
+
+  /* Never store NULL in the table: later lookups pass the slot to strcmp. */
+  copy = strdup(dbi);
+  if (!copy)
+    return -1;
+  walk.dbi_names[i] = copy;
+
+  if (verbose > 1) {
+    print(" - found '%s' area\n", dbi);
+    fflush(NULL);
+  }
+
+  return last = i;
+}
+
+/* Count one problem and, unless quiet, aggregate it by caption.
+ *
+ * Captions are matched by pointer, so the same kind of problem must be
+ * reported with the same string object. `extra` is an optional
+ * printf-style format for the variadic details shown when verbose > 1. */
+static void problem_add(const char *object, size_t entry_number,
+                        const char *msg, const char *extra, ...) {
+  total_problems++;
+
+  if (!quiet) {
+    int need_fflush = 0;
+    struct problem *p;
+
+    for (p = problems_list; p; p = p->pr_next)
+      if (p->caption == msg)
+        break;
+
+    if (!p) {
+      p = calloc(1, sizeof(*p));
+      if (p) {
+        p->caption = msg;
+        p->pr_next = problems_list;
+        problems_list = p;
+        need_fflush = 1;
+      }
+    }
+
+    /* On allocation failure the problem is still counted in total_problems
+     * and printed below; only the per-caption summary misses it. */
+    if (p)
+      p->count++;
+    if (verbose > 1) {
+      print(" %s #%zu: %s", object, entry_number, msg);
+      if (extra) {
+        va_list args;
+        printf(" (");
+        va_start(args, extra);
+        vfprintf(stdout, extra, args);
+        va_end(args);
+        printf(")");
+      }
+      printf("\n");
+      if (need_fflush)
+        fflush(NULL);
+    }
+  }
+}
+
+/* Detach the current problem list and return it, so a nested scope can
+ * collect its own problems; pair with problems_pop(). */
+static struct problem *problems_push(void) {
+  struct problem *p = problems_list;
+  problems_list = NULL;
+  return p;
+}
+
+/* Print and free the problems collected since the matching
+ * problems_push(), restore `list` as the current list, and return the
+ * total count of the problems that were freed. */
+static size_t problems_pop(struct problem *list) {
+  size_t count = 0;
+
+  if (problems_list) {
+    int i;
+
+    print(" - problems: ");
+    for (i = 0; problems_list; ++i) {
+      struct problem *p = problems_list->pr_next;
+      count += problems_list->count;
+      print("%s%s (%zu)", i ? ", " : "", problems_list->caption,
+            problems_list->count);
+      free(problems_list);
+      problems_list = p;
+    }
+    print("\n");
+    fflush(NULL);
+  }
+
+  problems_list = list;
+  return count;
+}
+
+/* Page-walk callback: checks the byte accounting of one page (or a span of
+ * `pgnumber` pages starting at `pgno`), accumulates per-database statistics
+ * in `walk`, and marks each page in walk.pagemap.
+ *
+ * Calls with type == NULL only test for a pending signal.
+ * Returns ENOMEM when the database name cannot be registered, EINTR once a
+ * signal has been caught, and MDB_SUCCESS otherwise. */
+static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi,
+                     const char *type, int nentries, int payload_bytes,
+                     int header_bytes, int unused_bytes) {
+  (void)ctx;
+
+  if (type) {
+    size_t page_bytes = payload_bytes + header_bytes + unused_bytes;
+    size_t page_size = pgnumber * stat.ms_psize;
+    int index = pagemap_lookup_dbi(dbi);
+    if (index < 0)
+      return ENOMEM;
+
+    if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) {
+      if (pgnumber == 1)
+        print(" %s-page %zu", type, pgno);
+      else
+        print(" %s-span %zu[%u]", type, pgno, pgnumber);
+      print(" of %s: header %i, payload %i, unused %i\n", dbi, header_bytes,
+            payload_bytes, unused_bytes);
+    }
+
+    walk.pgcount += pgnumber;
+
+    /* Every %zu argument must really be a size_t: problem_add() is plain
+     * variadic, so nothing converts an int or unsigned argument. */
+    if (unused_bytes < 0 || (size_t)unused_bytes > page_size)
+      problem_add("page", pgno, "illegal unused-bytes", "%zu < %i < %zu",
+                  (size_t)0, unused_bytes, (size_t)stat.ms_psize);
+
+    if (header_bytes < (int)sizeof(long) ||
+        (size_t)header_bytes >= stat.ms_psize - sizeof(long))
+      problem_add("page", pgno, "illegal header-length", "%zu < %i < %zu",
+                  sizeof(long), header_bytes, stat.ms_psize - sizeof(long));
+    if (payload_bytes < 1) {
+      if (nentries > 1) {
+        problem_add("page", pgno, "zero size-of-entry",
+                    "payload %i bytes, %i entries", payload_bytes, nentries);
+        if ((size_t)header_bytes + unused_bytes < page_size) {
+          /* LY: hush a misuse error */
+          page_bytes = page_size;
+        }
+      } else {
+        problem_add("page", pgno, "empty", "payload %i bytes, %i entries",
+                    payload_bytes, nentries);
+        walk.dbi_empty_pages[index] += 1;
+      }
+    }
+
+    if (page_bytes != page_size) {
+      problem_add("page", pgno, "misused", "%zu != %zu (%ih + %ip + %iu)",
+                  page_size, page_bytes, header_bytes, payload_bytes,
+                  unused_bytes);
+      if (page_size > page_bytes)
+        walk.dbi_lost_bytes[index] += page_size - page_bytes;
+    } else {
+      walk.dbi_payload_bytes[index] += payload_bytes + header_bytes;
+      walk.total_payload_bytes += payload_bytes + header_bytes;
+    }
+
+    if (pgnumber) {
+      do {
+        if (pgno >= lastpgno)
+          problem_add("page", pgno, "wrong page-no", "%zu >= %zu", pgno,
+                      lastpgno);
+        else if (walk.pagemap[pgno])
+          problem_add("page", pgno, "already used", "in %s",
+                      walk.dbi_names[walk.pagemap[pgno]]);
+        else {
+          walk.pagemap[pgno] = index;
+          walk.dbi_pages[index] += 1;
+        }
+        ++pgno;
+      } while (--pgnumber);
+    }
+  }
+
+  return gotsignal ? EINTR : MDB_SUCCESS;
+}
+
+/* Per-record callback type passed to process_db(). */
+typedef int(visitor)(size_t record_number, MDB_val *key, MDB_val *data);
+static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent);
+
+/* Record handler that performs no per-record checks. */
+static int handle_userdb(size_t record_number, MDB_val *key, MDB_val *data) {
+  (void)record_number;
+  (void)key;
+  (void)data;
+  return MDB_SUCCESS;
+}
+
+/* Record handler for the free-list database: the key is checked as a
+ * transaction id and the data as a counted list of page numbers. Problems
+ * are reported through problem_add(); the handler always returns
+ * MDB_SUCCESS. */
+static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) {
+  char *bad = "";
+  size_t pg, prev;
+  ssize_t i, number, span = 0;
+  size_t *iptr = data->mv_data, txnid = 0;
+
+  if (key->mv_size != sizeof(txnid))
+    problem_add("entry", record_number, "wrong txn-id size", "key-size %zu",
+                key->mv_size);
+  else {
+    /* Read the key only after its size is known to be right; txnid stays 0
+     * for a malformed key. */
+    memcpy(&txnid, key->mv_data, sizeof(txnid));
+    if (txnid < 1 || txnid > info.me_last_txnid)
+      problem_add("entry", record_number, "wrong txn-id", "%zu", txnid);
+  }
+
+  if (data->mv_size < sizeof(size_t) || data->mv_size % sizeof(size_t))
+    problem_add("entry", record_number, "wrong idl size", "%zu", data->mv_size);
+  else {
+    number = *iptr++;
+    if (number >= MDB_IDL_UM_MAX)
+      problem_add("entry", record_number, "wrong idl length", "%zi", number);
+    else if ((number + 1) * sizeof(size_t) != data->mv_size)
+      /* Report the byte count that was actually compared. */
+      problem_add("entry", record_number, "mismatch idl length", "%zu != %zu",
+                  (number + 1) * sizeof(size_t), data->mv_size);
+    else {
+      freedb_pages += number;
+      if (info.me_tail_txnid > txnid)
+        reclaimable_pages += number;
+      for (i = number, prev = 1; --i >= 0;) {
+        pg = iptr[i];
+        if (pg < 2 /* META_PAGE */ || pg > 
info.me_last_pgno) + problem_add("entry", record_number, "wrong idl entry", + "2 < %zi < %zi", pg, info.me_last_pgno); + else if (pg <= prev) { + bad = " [bad sequence]"; + problem_add("entry", record_number, "bad sequence", "%zi <= %zi", pg, + prev); + } + prev = pg; + pg += span; + for (; i >= span && iptr[i - span] == pg; span++, pg++) + ; + } + if (verbose > 2 && !only_subdb) { + print(" transaction %zu, %zd pages, maxspan %zd%s\n", + *(size_t *)key->mv_data, number, span, bad); + if (verbose > 3) { + int j = number - 1; + while (j >= 0) { + pg = iptr[j]; + for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) + ; + if (span > 1) + print(" %9zu[%zd]\n", pg, span); + else + print(" %9zu\n", pg); + } + } + } + } + } + + return MDB_SUCCESS; +} + +static int handle_maindb(size_t record_number, MDB_val *key, MDB_val *data) { + char *name; + int rc; + size_t i; + + name = key->mv_data; + for (i = 0; i < key->mv_size; ++i) { + if (name[i] < ' ') + return handle_userdb(record_number, key, data); + } + + name = malloc(key->mv_size + 1); + memcpy(name, key->mv_data, key->mv_size); + name[key->mv_size] = '\0'; + userdb_count++; + + rc = process_db(-1, name, handle_userdb, 0); + free(name); + if (rc != MDB_INCOMPATIBLE) + return rc; + + return handle_userdb(record_number, key, data); +} + +static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { + MDB_cursor *mc; + MDBX_stat ms; + MDB_val key, data; + MDB_val prev_key, prev_data; + unsigned flags; + int rc, i; + struct problem *saved_list; + size_t problems_count; + + unsigned record_count = 0, dups = 0; + size_t key_bytes = 0, data_bytes = 0; + + if (0 > (int)dbi) { + rc = mdbx_dbi_open(txn, name, 0, &dbi); + if (rc) { + if (!name || + rc != + MDB_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { + error(" - mdbx_open '%s' failed, error %d %s\n", name ? 
name : "main", + rc, mdbx_strerror(rc)); + } + return rc; + } + } + + /* only_subdb is set: skip every other named database */ if (dbi >= 2 /* CORE_DBS */ && name && only_subdb && + strcmp(only_subdb, name)) { + if (verbose) { + print("Skip processing '%s'...\n", name); + fflush(NULL); + } + skipped_subdb++; + return MDB_SUCCESS; + } + + if (!silent && verbose) { + print("Processing '%s'...\n", name ? name : "main"); + fflush(NULL); + } + + rc = mdbx_dbi_flags(txn, dbi, &flags); + if (rc) { + error(" - mdbx_dbi_flags failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc; + } + + rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); + if (rc) { + error(" - mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc; + } + + if (!silent && verbose) { + print(" - dbi-id %d, flags:", dbi); + if (!flags) + print(" none"); + else { + for (i = 0; dbflags[i].bit; i++) + if (flags & dbflags[i].bit) + print(" %s", dbflags[i].name); + } + print(" (0x%02X)\n", flags); + if (verbose > 1) { + print(" - page size %u, entries %zu\n", ms.ms_psize, ms.ms_entries); + print(" - b-tree depth %u, pages: branch %zu, leaf %zu, overflow %zu\n", + ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages, + ms.ms_overflow_pages); + } + } + + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) { + error(" - mdbx_cursor_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc; + } + + /* problems found from here on are collected in a fresh list and counted at bailout */ saved_list = problems_push(); + prev_key.mv_data = NULL; + prev_data.mv_size = 0; + /* visit every record, keeping the previous key/data for the ordering checks below */ rc = mdbx_cursor_get(mc, &key, &data, MDB_FIRST); + while (rc == MDB_SUCCESS) { + if (gotsignal) { + print(" - interrupted by signal\n"); + fflush(NULL); + rc = EINTR; + goto bailout; + } + + if (key.mv_size > maxkeysize) { + problem_add("entry", record_count, "key length exceeds max-key-size", + "%zu > %zu", key.mv_size, maxkeysize); + } else if ((flags & MDB_INTEGERKEY) && key.mv_size != sizeof(size_t) && + key.mv_size != sizeof(int)) { + problem_add("entry", record_count, "wrong key length", "%zu != %zu", + key.mv_size, sizeof(size_t)); + } + + if ((flags & MDB_INTEGERDUP) && 
data.mv_size != sizeof(size_t) && + data.mv_size != sizeof(int)) { + problem_add("entry", record_count, "wrong data length", "%zu != %zu", + data.mv_size, sizeof(size_t)); + } + + if (prev_key.mv_data) { + if ((flags & MDB_DUPFIXED) && prev_data.mv_size != data.mv_size) { + problem_add("entry", record_count, "different data length", + "%zu != %zu", prev_data.mv_size, data.mv_size); + } + + int cmp = mdbx_cmp(txn, dbi, &prev_key, &key); + if (cmp > 0) { + problem_add("entry", record_count, "broken ordering of entries", NULL); + } else if (cmp == 0) { + ++dups; + if (!(flags & MDB_DUPSORT)) + problem_add("entry", record_count, "duplicated entries", NULL); + else if (flags & MDB_INTEGERDUP) { + cmp = mdbx_dcmp(txn, dbi, &prev_data, &data); + if (cmp > 0) + problem_add("entry", record_count, + "broken ordering of multi-values", NULL); + } + } + } else if (verbose) { + if (flags & MDB_INTEGERKEY) + print(" - fixed key-size %zu\n", key.mv_size); + if (flags & (MDB_INTEGERDUP | MDB_DUPFIXED)) + print(" - fixed data-size %zu\n", data.mv_size); + } + + if (handler) { + rc = handler(record_count, &key, &data); + if (rc) + goto bailout; + } + + record_count++; + key_bytes += key.mv_size; + data_bytes += data.mv_size; + + prev_key = key; + prev_data = data; + rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT); + } + if (rc != MDB_NOTFOUND) + error(" - mdbx_cursor_get failed, error %d %s\n", rc, mdbx_strerror(rc)); + else + rc = 0; + + if (record_count != ms.ms_entries) + problem_add("entry", record_count, "differentent number of entries", + "%zu != %zu", record_count, ms.ms_entries); +bailout: + problems_count = problems_pop(saved_list); + if (!silent && verbose) { + print(" - summary: %u records, %u dups, %zu key's bytes, %zu data's " + "bytes, %zu problems\n", + record_count, dups, key_bytes, data_bytes, problems_count); + fflush(NULL); + } + + mdbx_cursor_close(mc); + return rc || problems_count; +} + +static void usage(char *prog) { + fprintf(stderr, + "usage: %s dbpath [-V] 
[-v] [-n] [-q] [-w] [-c] [-d] [-s subdb]\n" + " -V\t\tshow version\n" + " -v\t\tmore verbose, could be used multiple times\n" + " -n\t\tNOSUBDIR mode for open\n" + " -q\t\tbe quiet\n" + " -w\t\tlock DB for writing while checking\n" + " -d\t\tdisable page-by-page traversal of b-tree\n" + " -s subdb\tprocess a specific subdatabase only\n" + " -c\t\tforce cooperative mode (don't try exclusive)\n", + prog); + exit(EXIT_INTERRUPTED); +} + +const char *meta_synctype(size_t sign) { + switch (sign) { + case 0: + return "no-sync/legacy"; + case 1: + return "weak"; + default: + return "steady"; + } +} + +int meta_lt(size_t txn1, size_t sign1, size_t txn2, size_t sign2) { + return ((sign1 > 1) == (sign2 > 1)) ? txn1 < txn2 : txn2 && sign2 > 1; +} + +int main(int argc, char *argv[]) { + int i, rc; + char *prog = argv[0]; + char *envname; + int envflags = MDB_RDONLY; + int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; + int dont_traversal = 0; + size_t n; + struct timespec timestamp_start, timestamp_finish; + double elapsed; + + atexit(pagemap_cleanup); + + if (clock_gettime(CLOCK_MONOTONIC, ×tamp_start)) { + rc = errno; + error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); + return EXIT_FAILURE_SYS; + } + + if (argc < 2) { + usage(prog); + } + + while ((i = getopt(argc, argv, "Vvqnwcds:")) != EOF) { + switch (i) { + case 'V': + printf("%s\n", MDB_VERSION_STRING); + exit(EXIT_SUCCESS); + break; + case 'v': + verbose++; + break; + case 'q': + quiet = 1; + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 'w': + envflags &= ~MDB_RDONLY; + break; + case 'c': + exclusive = 0; + break; + case 'd': + dont_traversal = 1; + break; + case 's': + if (only_subdb && strcmp(only_subdb, optarg)) + usage(prog); + only_subdb = optarg; + break; + default: + usage(prog); + } + } + + if (optind != argc - 1) + usage(prog); + +#ifdef SIGPIPE + signal(SIGPIPE, signal_handler); +#endif +#ifdef SIGHUP + signal(SIGHUP, signal_handler); +#endif + 
signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + + envname = argv[optind]; + print("Running mdbx_chk for '%s' in %s mode...\n", envname, + (envflags & MDB_RDONLY) ? "read-only" : "write-lock"); + fflush(NULL); + + rc = mdbx_env_create(&env); + if (rc) { + error("mdbx_env_create failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc < 0 ? EXIT_FAILURE_MDB : EXIT_FAILURE_SYS; + } + + rc = mdbx_env_get_maxkeysize(env); + if (rc < 0) { + error("mdbx_env_get_maxkeysize failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto bailout; + } + maxkeysize = rc; + + rc = mdbx_env_set_maxdbs(env, MAX_DBI); + if (rc < 0) { + error("mdbx_env_set_maxdbs failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + rc = mdbx_env_open_ex(env, envname, envflags, 0664, &exclusive); + if (rc) { + error("mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + if (verbose) + print(" - %s mode\n", exclusive ? "monopolistic" : "cooperative"); + + if (!(envflags & MDB_RDONLY)) { + rc = mdbx_txn_begin(env, NULL, 0, &locktxn); + if (rc) { + error("mdbx_txn_begin(lock-write) failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto bailout; + } + } + + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) { + error("mdbx_txn_begin(read-only) failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto bailout; + } + + rc = mdbx_env_info(env, &info, sizeof(info)); + if (rc) { + error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + rc = mdbx_env_stat(env, &stat, sizeof(stat)); + if (rc) { + error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + lastpgno = info.me_last_pgno + 1; + errno = 0; + + if (verbose) { + double k = 1024.0; + const char sf[] = + "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! 
*/ + for (i = 0; sf[i + 1] && info.me_mapsize / k > 1000.0; ++i) + k *= 1024; + print(" - map size %zu (%.2f %cb)\n", info.me_mapsize, info.me_mapsize / k, + sf[i]); + if (info.me_mapaddr) + print(" - mapaddr %p\n", info.me_mapaddr); + print(" - pagesize %u, max keysize %zu (%s), max readers %u\n", + stat.ms_psize, maxkeysize, + (maxkeysize == 511) ? "default" : (maxkeysize == 0) ? "devel" + : "custom", + info.me_maxreaders); + print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", + info.me_last_txnid, info.me_tail_txnid, + info.me_last_txnid - info.me_tail_txnid); + + print(" - meta-1: %s %zu, %s", meta_synctype(info.me_meta1_sign), + info.me_meta1_txnid, meta_lt(info.me_meta1_txnid, info.me_meta1_sign, + info.me_meta2_txnid, info.me_meta2_sign) + ? "tail" + : "head"); + if (info.me_meta1_txnid > info.me_last_txnid) + print(", rolled-back %zu (%zu >>> %zu)", + info.me_meta1_txnid - info.me_last_txnid, info.me_meta1_txnid, + info.me_last_txnid); + print("\n"); + + print(" - meta-2: %s %zu, %s", meta_synctype(info.me_meta2_sign), + info.me_meta2_txnid, meta_lt(info.me_meta2_txnid, info.me_meta2_sign, + info.me_meta1_txnid, info.me_meta1_sign) + ? 
"tail" + : "head"); + if (info.me_meta2_txnid > info.me_last_txnid) + print(", rolled-back %zu (%zu >>> %zu)", + info.me_meta2_txnid - info.me_last_txnid, info.me_meta2_txnid, + info.me_last_txnid); + print("\n"); + } + + if (exclusive > 1) { + if (verbose) + print(" - perform full check last-txn-id with meta-pages\n"); + + if (!meta_lt(info.me_meta1_txnid, info.me_meta1_sign, info.me_meta2_txnid, + info.me_meta2_sign) && + info.me_meta1_txnid != info.me_last_txnid) { + print(" - meta-1 txn-id mismatch last-txn-id (%zi != %zi)\n", + info.me_meta1_txnid, info.me_last_txnid); + ++problems_meta; + } + + if (!meta_lt(info.me_meta2_txnid, info.me_meta2_sign, info.me_meta1_txnid, + info.me_meta1_sign) && + info.me_meta2_txnid != info.me_last_txnid) { + print(" - meta-2 txn-id mismatch last-txn-id (%zi != %zi)\n", + info.me_meta2_txnid, info.me_last_txnid); + ++problems_meta; + } + } else if (locktxn) { + if (verbose) + print(" - perform lite check last-txn-id with meta-pages (not a " + "monopolistic mode)\n"); + size_t last = (info.me_meta2_txnid > info.me_meta1_txnid) + ? info.me_meta2_txnid + : info.me_meta1_txnid; + if (last != info.me_last_txnid) { + print(" - last-meta mismatch last-txn-id (%zi != %zi)\n", last, + info.me_last_txnid); + ++problems_meta; + } + } else if (verbose) { + print(" - skip check last-txn-id with meta-pages (monopolistic or " + "write-lock mode only)\n"); + } + + if (!dont_traversal) { + struct problem *saved_list; + size_t traversal_problems; + size_t empty_pages, lost_bytes; + + print("Traversal b-tree...\n"); + fflush(NULL); + walk.pagemap = calloc(lastpgno, sizeof(*walk.pagemap)); + if (!walk.pagemap) { + rc = errno ? 
errno : ENOMEM; + error("calloc failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + saved_list = problems_push(); + rc = mdbx_env_pgwalk(txn, pgvisitor, NULL); + traversal_problems = problems_pop(saved_list); + + if (rc) { + if (rc == EINTR && gotsignal) { + print(" - interrupted by signal\n"); + fflush(NULL); + } else { + error("mdbx_env_pgwalk failed, error %d %s\n", rc, mdbx_strerror(rc)); + } + goto bailout; + } + + for (n = 0; n < lastpgno; ++n) + if (!walk.pagemap[n]) + walk.dbi_pages[0] += 1; + + empty_pages = lost_bytes = 0; + for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { + empty_pages += walk.dbi_empty_pages[i]; + lost_bytes += walk.dbi_lost_bytes[i]; + } + + if (verbose) { + size_t total_page_bytes = walk.pgcount * stat.ms_psize; + print(" - dbi pages: %zu total", walk.pgcount); + if (verbose > 1) + for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) + print(", %s %zu", walk.dbi_names[i], walk.dbi_pages[i]); + print(", %s %zu\n", walk.dbi_names[0], walk.dbi_pages[0]); + if (verbose > 1) { + print(" - space info: total %zu bytes, payload %zu (%.1f%%), unused " + "%zu (%.1f%%)\n", + total_page_bytes, walk.total_payload_bytes, + walk.total_payload_bytes * 100.0 / total_page_bytes, + total_page_bytes - walk.total_payload_bytes, + (total_page_bytes - walk.total_payload_bytes) * 100.0 / + total_page_bytes); + for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { + size_t dbi_bytes = walk.dbi_pages[i] * stat.ms_psize; + print(" %s: subtotal %zu bytes (%.1f%%), payload %zu (%.1f%%), " + "unused %zu (%.1f%%)", + walk.dbi_names[i], dbi_bytes, + dbi_bytes * 100.0 / total_page_bytes, walk.dbi_payload_bytes[i], + walk.dbi_payload_bytes[i] * 100.0 / dbi_bytes, + dbi_bytes - walk.dbi_payload_bytes[i], + (dbi_bytes - walk.dbi_payload_bytes[i]) * 100.0 / dbi_bytes); + if (walk.dbi_empty_pages[i]) + print(", %zu empty pages", walk.dbi_empty_pages[i]); + if (walk.dbi_lost_bytes[i]) + print(", %zu bytes lost", walk.dbi_lost_bytes[i]); + print("\n"); 
+ } + } + print(" - summary: average fill %.1f%%", + walk.total_payload_bytes * 100.0 / total_page_bytes); + if (empty_pages) + print(", %zu empty pages", empty_pages); + if (lost_bytes) + print(", %zu bytes lost", lost_bytes); + print(", %zu problems\n", traversal_problems); + } + } else if (verbose) { + print("Skipping b-tree walk...\n"); + fflush(NULL); + } + + if (!verbose) + print("Iterating DBIs...\n"); + problems_maindb = process_db(-1, /* MAIN_DBI */ NULL, NULL, 0); + problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0); + + if (verbose) { + size_t value = info.me_mapsize / stat.ms_psize; + double percent = value / 100.0; + print(" - pages info: %zu total", value); + print(", allocated %zu (%.1f%%)", lastpgno, lastpgno / percent); + + if (verbose > 1) { + value = info.me_mapsize / stat.ms_psize - lastpgno; + print(", remained %zu (%.1f%%)", value, value / percent); + + value = lastpgno - freedb_pages; + print(", used %zu (%.1f%%)", value, value / percent); + + print(", gc %zu (%.1f%%)", freedb_pages, freedb_pages / percent); + + value = freedb_pages - reclaimable_pages; + print(", detained %zu (%.1f%%)", value, value / percent); + + print(", reclaimable %zu (%.1f%%)", reclaimable_pages, + reclaimable_pages / percent); + } + + value = info.me_mapsize / stat.ms_psize - lastpgno + reclaimable_pages; + print(", available %zu (%.1f%%)\n", value, value / percent); + } + + if (problems_maindb == 0 && problems_freedb == 0) { + if (!dont_traversal && (exclusive || locktxn)) { + if (walk.pgcount != lastpgno - freedb_pages) { + error("used pages mismatch (%zu != %zu)\n", walk.pgcount, + lastpgno - freedb_pages); + } + if (walk.dbi_pages[0] != freedb_pages) { + error("gc pages mismatch (%zu != %zu)\n", walk.dbi_pages[0], + freedb_pages); + } + } else if (verbose) { + print(" - skip check used and gc pages (btree-traversal with " + "monopolistic or write-lock mode only)\n"); + } + + if (!process_db(-1, NULL, handle_maindb, 1)) { + if (!userdb_count 
&& verbose) + print(" - does not contain multiple databases\n"); + } + } + +bailout: + if (txn) + mdbx_txn_abort(txn); + if (locktxn) + mdbx_txn_abort(locktxn); + if (env) + mdbx_env_close(env); + fflush(NULL); + if (rc) { + if (rc < 0) + return gotsignal ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS; + return EXIT_FAILURE_MDB; + } + + if (clock_gettime(CLOCK_MONOTONIC, ×tamp_finish)) { + rc = errno; + error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); + return EXIT_FAILURE_SYS; + } + + elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec + + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; + + total_problems += problems_meta; + if (total_problems || problems_maindb || problems_freedb) { + print("Total %zu error(s) is detected, elapsed %.3f seconds.\n", + total_problems, elapsed); + if (problems_meta || problems_maindb || problems_freedb) + return EXIT_FAILURE_CHECK_MAJOR; + return EXIT_FAILURE_CHECK_MINOR; + } + print("No error is detected, elapsed %.3f seconds\n", elapsed); + return EXIT_SUCCESS; +} diff --git a/mdb_copy.1 b/mdbx_copy.1 similarity index 94% rename from mdb_copy.1 rename to mdbx_copy.1 index 157e741d..06a620fd 100644 --- a/mdb_copy.1 +++ b/mdbx_copy.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_COPY 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_copy \- LMDB environment copy tool +mdbx_copy \- LMDB environment copy tool .SH SYNOPSIS -.B mdb_copy +.B mdbx_copy [\c .BR \-V ] [\c @@ -18,7 +18,7 @@ mdb_copy \- LMDB environment copy tool .BR dstpath ] .SH DESCRIPTION The -.B mdb_copy +.B mdbx_copy utility copies an LMDB environment. The environment can be copied regardless of whether it is currently in use. No lockfile is created, since it gets recreated at need. @@ -52,6 +52,6 @@ This utility can trigger significant file size growth if run in parallel with write transactions, because pages which they free during copying cannot be reused until the copy is done. 
.SH "SEE ALSO" -.BR mdb_stat (1) +.BR mdbx_stat (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_copy.c b/mdbx_copy.c new file mode 100644 index 00000000..b80b70a5 --- /dev/null +++ b/mdbx_copy.c @@ -0,0 +1,76 @@ +/* mdbx_copy.c - memory-mapped database backup tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2012-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "mdbx.h" +#include +#include +#include + +static void sighandle(int sig) { (void)sig; } + +int main(int argc, char *argv[]) { + int rc; + MDB_env *env = NULL; + const char *progname = argv[0], *act; + unsigned flags = MDB_RDONLY; + unsigned cpflags = 0; + + for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { + if (argv[1][1] == 'n' && argv[1][2] == '\0') + flags |= MDB_NOSUBDIR; + else if (argv[1][1] == 'c' && argv[1][2] == '\0') + cpflags |= MDB_CP_COMPACT; + else if (argv[1][1] == 'V' && argv[1][2] == '\0') { + printf("%s\n", MDB_VERSION_STRING); + exit(0); + } else + argc = 0; + } + + if (argc < 2 || argc > 3) { + fprintf(stderr, "usage: %s [-V] [-c] [-n] srcpath [dstpath]\n", progname); + exit(EXIT_FAILURE); + } + +#ifdef SIGPIPE + signal(SIGPIPE, sighandle); +#endif +#ifdef SIGHUP + signal(SIGHUP, sighandle); +#endif + signal(SIGINT, sighandle); + signal(SIGTERM, sighandle); + + act = "opening environment"; + rc = mdbx_env_create(&env); + if (rc == MDB_SUCCESS) { + rc = mdbx_env_open(env, argv[1], flags, 0640); + } + if (rc == MDB_SUCCESS) { + act = "copying"; + if (argc == 2) + rc = mdbx_env_copyfd2(env, STDOUT_FILENO, cpflags); + else + rc = mdbx_env_copy2(env, argv[2], cpflags); + } + if (rc) + 
fprintf(stderr, "%s: %s failed, error %d (%s)\n", progname, act, rc, + mdbx_strerror(rc)); + mdbx_env_close(env); + + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/mdb_dump.1 b/mdbx_dump.1 similarity index 94% rename from mdb_dump.1 rename to mdbx_dump.1 index 4c4553ce..80718bb0 100644 --- a/mdb_dump.1 +++ b/mdbx_dump.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_DUMP 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_dump \- LMDB environment export tool +mdbx_dump \- LMDB environment export tool .SH SYNOPSIS -.B mdb_dump +.B mdbx_dump [\c .BR \-V ] [\c @@ -23,11 +23,11 @@ mdb_dump \- LMDB environment export tool .BR \ envpath .SH DESCRIPTION The -.B mdb_dump +.B mdbx_dump utility reads a database and writes its contents to the standard output using a portable flat-text format understood by the -.BR mdb_load (1) +.BR mdbx_load (1) utility. .SH OPTIONS .TP @@ -69,9 +69,9 @@ will result in new databases that use the default comparison functions. damaged beyond repair permitting neither record storage nor retrieval.\fP The only available workaround is to modify the source for the -.BR mdb_load (1) +.BR mdbx_load (1) utility to load the database using the correct comparison functions. .SH "SEE ALSO" -.BR mdb_load (1) +.BR mdbx_load (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_dump.c b/mdbx_dump.c new file mode 100644 index 00000000..16543d09 --- /dev/null +++ b/mdbx_dump.c @@ -0,0 +1,316 @@ +/* mdbx_dump.c - memory-mapped database dump tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2011-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "mdbx.h" +#include <ctype.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#define PRINT 1 +static int mode; + +typedef struct flagbit { + int bit; + char *name; +} flagbit; + +flagbit dbflags[] = {{MDB_REVERSEKEY, "reversekey"}, + {MDB_DUPSORT, "dupsort"}, + {MDB_INTEGERKEY, "integerkey"}, + {MDB_DUPFIXED, "dupfixed"}, + {MDB_INTEGERDUP, "integerdup"}, + {MDB_REVERSEDUP, "reversedup"}, + {0, NULL}}; + +static volatile sig_atomic_t gotsig; + +static void dumpsig(int sig) { + (void)sig; + gotsig = 1; +} + +static const char hexc[] = "0123456789abcdef"; + +/* Writes one byte to stdout as two lowercase hex digits. */ +static void hex(unsigned char c) { + putchar(hexc[c >> 4]); + putchar(hexc[c & 0xf]); +} + +/* Writes a value in print format: a leading space, printable bytes as-is and + * every other byte as a backslash plus two hex digits. A literal backslash is + * hex-escaped too, so it cannot be mistaken for the start of an escape. */ +static void text(MDB_val *v) { + unsigned char *c, *end; + + putchar(' '); + c = v->mv_data; + end = c + v->mv_size; + while (c < end) { + if (isprint(*c) && *c != '\\') { + putchar(*c); + } else { + putchar('\\'); + hex(*c); + } + c++; + } + putchar('\n'); +} + +/* Writes a value in bytevalue format: a leading space, then every byte as hex. */ +static void byte(MDB_val *v) { + unsigned char *c, *end; + + putchar(' '); + c = v->mv_data; + end = c + v->mv_size; + while (c < end) { + hex(*c++); + } + putchar('\n'); +} + +/* Dump in BDB-compatible format */ +static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) { + MDB_cursor *mc; + MDBX_stat ms; + MDB_val key, data; + MDBX_envinfo info; + unsigned int flags; + int rc, i; + + rc = mdbx_dbi_flags(txn, dbi, &flags); + if (rc) + return rc; + + rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); + if (rc) + return rc; + + rc = mdbx_env_info(mdbx_txn_env(txn), &info, sizeof(info)); + if (rc) + return rc; + + printf("VERSION=3\n"); + printf("format=%s\n", mode & PRINT ? 
"print" : "bytevalue"); + if (name) + printf("database=%s\n", name); + printf("type=btree\n"); + printf("mapsize=%zu\n", info.me_mapsize); + if (info.me_mapaddr) + printf("mapaddr=%p\n", info.me_mapaddr); + printf("maxreaders=%u\n", info.me_maxreaders); + + for (i = 0; dbflags[i].bit; i++) + if (flags & dbflags[i].bit) + printf("%s=1\n", dbflags[i].name); + + printf("db_pagesize=%u\n", ms.ms_psize); + printf("HEADER=END\n"); + + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) + return rc; + + while ((rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT)) == MDB_SUCCESS) { + if (gotsig) { + rc = EINTR; + break; + } + if (mode & PRINT) { + text(&key); + text(&data); + } else { + byte(&key); + byte(&data); + } + } + printf("DATA=END\n"); + /* release the cursor on every exit path once it has been opened */ + mdbx_cursor_close(mc); + if (rc == MDB_NOTFOUND) + rc = MDB_SUCCESS; + + return rc; +} + +static void usage(char *prog) { + fprintf(stderr, + "usage: %s [-V] [-f output] [-l] [-n] [-p] [-a|-s subdb] dbpath\n", + prog); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) { + int i, rc; + MDB_env *env; + MDB_txn *txn; + MDB_dbi dbi; + char *prog = argv[0]; + char *envname; + char *subname = NULL; + int alldbs = 0, envflags = 0, list = 0; + + if (argc < 2) { + usage(prog); + } + + /* -a: dump main DB and all subDBs + * -l: only print the names of the subDBs + * -s: dump only the named subDB + * -n: use NOSUBDIR flag on env_open + * -p: use printable characters + * -f: write to file instead of stdout + * -V: print version and exit + * (default) dump only the main DB + */ + while ((i = getopt(argc, argv, "af:lnps:V")) != EOF) { + switch (i) { + case 'V': + printf("%s\n", MDB_VERSION_STRING); + exit(0); + break; + case 'l': + list = 1; + /*FALLTHROUGH*/; + case 'a': + if (subname) + usage(prog); + alldbs++; + break; + case 'f': + if (freopen(optarg, "w", stdout) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", prog, optarg, strerror(errno)); + exit(EXIT_FAILURE); + } + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 'p': + mode |= PRINT; + break; + case 's': + if (alldbs) + 
usage(prog); + subname = optarg; + break; + default: + usage(prog); + } + } + + if (optind != argc - 1) + usage(prog); + +#ifdef SIGPIPE + signal(SIGPIPE, dumpsig); +#endif +#ifdef SIGHUP + signal(SIGHUP, dumpsig); +#endif + signal(SIGINT, dumpsig); + signal(SIGTERM, dumpsig); + + envname = argv[optind]; + rc = mdbx_env_create(&env); + if (rc) { + fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, + mdbx_strerror(rc)); + return EXIT_FAILURE; + } + + if (alldbs || subname) { + mdbx_env_set_maxdbs(env, 2); + } + + rc = mdbx_env_open(env, envname, envflags | MDB_RDONLY, 0664); + if (rc) { + fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + rc = mdbx_dbi_open(txn, subname, 0, &dbi); + if (rc) { + fprintf(stderr, "mdbx_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + + if (alldbs) { + MDB_cursor *cursor; + MDB_val key; + int count = 0; + + rc = mdbx_cursor_open(txn, dbi, &cursor); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { + char *str; + MDB_dbi db2; + if (memchr(key.mv_data, '\0', key.mv_size)) + continue; + count++; + str = malloc(key.mv_size + 1); + memcpy(str, key.mv_data, key.mv_size); + str[key.mv_size] = '\0'; + rc = mdbx_dbi_open(txn, str, 0, &db2); + if (rc == MDB_SUCCESS) { + if (list) { + printf("%s\n", str); + list++; + } else { + rc = dumpit(txn, db2, str); + if (rc) + break; + } + mdbx_dbi_close(env, db2); + } + free(str); + if (rc) + continue; + } + mdbx_cursor_close(cursor); + if (!count) { + fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, + envname); + rc = MDB_NOTFOUND; + } else if (rc == MDB_INCOMPATIBLE) { + /* LY: 
the record it not a named sub-db. */ + rc = MDB_SUCCESS; + } + } else { + rc = dumpit(txn, dbi, subname); + } + if (rc && rc != MDB_NOTFOUND) + fprintf(stderr, "%s: %s: %s\n", prog, envname, mdbx_strerror(rc)); + + mdbx_dbi_close(env, dbi); +txn_abort: + mdbx_txn_abort(txn); +env_close: + mdbx_env_close(env); + + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/mdb_load.1 b/mdbx_load.1 similarity index 94% rename from mdb_load.1 rename to mdbx_load.1 index 5e082f67..63b88f10 100644 --- a/mdb_load.1 +++ b/mdbx_load.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_LOAD 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_load \- LMDB environment import tool +mdbx_load \- LMDB environment import tool .SH SYNOPSIS -.B mdb_load +.B mdbx_load [\c .BR \-V ] [\c @@ -22,15 +22,15 @@ mdb_load \- LMDB environment import tool .BR \ envpath .SH DESCRIPTION The -.B mdb_load +.B mdbx_load utility reads from the standard input and loads it into the LMDB environment .BR envpath . The input to -.B mdb_load +.B mdbx_load must be in the output format specified by the -.BR mdb_dump (1) +.BR mdbx_dump (1) utility or as specified by the .B -T option below. @@ -66,7 +66,7 @@ character; for example, \\0a is a newline character in the ASCII character set. For this reason, any backslash or newline characters that naturally occur in the text input must be escaped to avoid misinterpretation by -.BR mdb_load . +.BR mdbx_load . .SH DIAGNOSTICS Exit status is zero if no errors occur. @@ -74,6 +74,6 @@ Errors result in a non-zero exit status and a diagnostic message being written to standard error. .SH "SEE ALSO" -.BR mdb_dump (1) +.BR mdbx_dump (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_load.c b/mdbx_load.c new file mode 100644 index 00000000..a211b24e --- /dev/null +++ b/mdbx_load.c @@ -0,0 +1,466 @@ +/* mdbx_load.c - memory-mapped database load tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . 
+ * Copyright 2011-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "mdbx.h" +#include +#include +#include +#include +#include +#include + +#define PRINT 1 +#define NOHDR 2 +static int mode; + +static char *subname = NULL; + +static size_t lineno; +static int version; + +static int dbi_flags; + +static char *prog; + +static int Eof; + +static MDBX_envinfo info; + +static MDB_val kbuf, dbuf; + +#define STRLENOF(s) (sizeof(s) - 1) + +typedef struct flagbit { + int bit; + char *name; + int len; +} flagbit; + +#define S(s) s, STRLENOF(s) + +flagbit dbflags[] = {{MDB_REVERSEKEY, S("reversekey")}, + {MDB_DUPSORT, S("dupsort")}, + {MDB_INTEGERKEY, S("integerkey")}, + {MDB_DUPFIXED, S("dupfixed")}, + {MDB_INTEGERDUP, S("integerdup")}, + {MDB_REVERSEDUP, S("reversedup")}, + {0, NULL, 0}}; + +static void readhdr(void) { + char *ptr; + + dbi_flags = 0; + while (fgets(dbuf.mv_data, dbuf.mv_size, stdin) != NULL) { + lineno++; + if (!strncmp(dbuf.mv_data, "db_pagesize=", STRLENOF("db_pagesize=")) || + !strncmp(dbuf.mv_data, "duplicates=", STRLENOF("duplicates="))) { + /* LY: silently ignore information fields. 
*/ + continue; + } else if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { + version = atoi((char *)dbuf.mv_data + STRLENOF("VERSION=")); + if (version > 3) { + fprintf(stderr, "%s: line %zd: unsupported VERSION %d\n", prog, lineno, + version); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "HEADER=END", STRLENOF("HEADER=END"))) { + break; + } else if (!strncmp(dbuf.mv_data, "format=", STRLENOF("format="))) { + if (!strncmp((char *)dbuf.mv_data + STRLENOF("FORMAT="), "print", + STRLENOF("print"))) + mode |= PRINT; + else if (strncmp((char *)dbuf.mv_data + STRLENOF("FORMAT="), "bytevalue", + STRLENOF("bytevalue"))) { + fprintf(stderr, "%s: line %zd: unsupported FORMAT %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("FORMAT=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "database=", STRLENOF("database="))) { + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + if (subname) + free(subname); + subname = strdup((char *)dbuf.mv_data + STRLENOF("database=")); + } else if (!strncmp(dbuf.mv_data, "type=", STRLENOF("type="))) { + if (strncmp((char *)dbuf.mv_data + STRLENOF("type="), "btree", + STRLENOF("btree"))) { + fprintf(stderr, "%s: line %zd: unsupported type %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("type=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "mapaddr=", STRLENOF("mapaddr="))) { + int i; + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + i = sscanf((char *)dbuf.mv_data + STRLENOF("mapaddr="), "%p", + &info.me_mapaddr); + if (i != 1) { + fprintf(stderr, "%s: line %zd: invalid mapaddr %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("mapaddr=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "mapsize=", STRLENOF("mapsize="))) { + int i; + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%zu", + &info.me_mapsize); + if (i 
!= 1) { + fprintf(stderr, "%s: line %zd: invalid mapsize %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("mapsize=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "maxreaders=", STRLENOF("maxreaders="))) { + int i; + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + i = sscanf((char *)dbuf.mv_data + STRLENOF("maxreaders="), "%u", + &info.me_maxreaders); + if (i != 1) { + fprintf(stderr, "%s: line %zd: invalid maxreaders %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("maxreaders=")); + exit(EXIT_FAILURE); + } + } else { + int i; + for (i = 0; dbflags[i].bit; i++) { + if (!strncmp(dbuf.mv_data, dbflags[i].name, dbflags[i].len) && + ((char *)dbuf.mv_data)[dbflags[i].len] == '=') { + if (((char *)dbuf.mv_data)[dbflags[i].len + 1] == '1') + dbi_flags |= dbflags[i].bit; + break; + } + } + if (!dbflags[i].bit) { + ptr = memchr(dbuf.mv_data, '=', dbuf.mv_size); + if (!ptr) { + fprintf(stderr, "%s: line %zd: unexpected format\n", prog, lineno); + exit(EXIT_FAILURE); + } else { + *ptr = '\0'; + fprintf(stderr, "%s: line %zd: unrecognized keyword ignored: %s\n", + prog, lineno, (char *)dbuf.mv_data); + } + } + } + } +} + +static void badend(void) { + fprintf(stderr, "%s: line %zd: unexpected end of input\n", prog, lineno); +} + +static int unhex(unsigned char *c2) { + int x, c; + x = *c2++ & 0x4f; + if (x & 0x40) + x -= 55; + c = x << 4; + x = *c2 & 0x4f; + if (x & 0x40) + x -= 55; + c |= x; + return c; +} + +static int readline(MDB_val *out, MDB_val *buf) { + unsigned char *c1, *c2, *end; + size_t len, l2; + int c; + + if (!(mode & NOHDR)) { + c = fgetc(stdin); + if (c == EOF) { + Eof = 1; + return EOF; + } + if (c != ' ') { + lineno++; + if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { + badend: + Eof = 1; + badend(); + return EOF; + } + if (c == 'D' && !strncmp(buf->mv_data, "ATA=END", STRLENOF("ATA=END"))) + return EOF; + goto badend; + } + } + if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { + Eof = 
1; + return EOF; + } + lineno++; + + c1 = buf->mv_data; + len = strlen((char *)c1); + l2 = len; + + /* Is buffer too short? */ + while (c1[len - 1] != '\n') { + buf->mv_data = realloc(buf->mv_data, buf->mv_size * 2); + if (!buf->mv_data) { + Eof = 1; + fprintf(stderr, "%s: line %zd: out of memory, line too long\n", prog, + lineno); + return EOF; + } + c1 = buf->mv_data; + c1 += l2; + if (fgets((char *)c1, buf->mv_size + 1, stdin) == NULL) { + Eof = 1; + badend(); + return EOF; + } + buf->mv_size *= 2; + len = strlen((char *)c1); + l2 += len; + } + c1 = c2 = buf->mv_data; + len = l2; + c1[--len] = '\0'; + end = c1 + len; + + if (mode & PRINT) { + while (c2 < end) { + if (*c2 == '\\') { + if (c2[1] == '\\') { + c1++; + c2 += 2; + } else { + if (c2 + 3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) { + Eof = 1; + badend(); + return EOF; + } + *c1++ = unhex(++c2); + c2 += 2; + } + } else { + /* copies are redundant when no escapes were used */ + *c1++ = *c2++; + } + } + } else { + /* odd length not allowed */ + if (len & 1) { + Eof = 1; + badend(); + return EOF; + } + while (c2 < end) { + if (!isxdigit(*c2) || !isxdigit(c2[1])) { + Eof = 1; + badend(); + return EOF; + } + *c1++ = unhex(c2); + c2 += 2; + } + } + c2 = out->mv_data = buf->mv_data; + out->mv_size = c1 - c2; + + return 0; +} + +static void usage(void) { + fprintf(stderr, "usage: %s [-V] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", + prog); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) { + int i, rc; + MDB_env *env; + MDB_txn *txn; + MDB_cursor *mc; + MDB_dbi dbi; + char *envname; + int envflags = 0, putflags = 0; + + prog = argv[0]; + + if (argc < 2) { + usage(); + } + + /* -f: load file instead of stdin + * -n: use NOSUBDIR flag on env_open + * -s: load into named subDB + * -N: use NOOVERWRITE on puts + * -T: read plaintext + * -V: print version and exit + */ + while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { + switch (i) { + case 'V': + printf("%s\n", MDB_VERSION_STRING); + exit(0); 
+ break; + case 'f': + if (freopen(optarg, "r", stdin) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", prog, optarg, strerror(errno)); + exit(EXIT_FAILURE); + } + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 's': + subname = strdup(optarg); + break; + case 'N': + putflags = MDB_NOOVERWRITE | MDB_NODUPDATA; + break; + case 'T': + mode |= NOHDR | PRINT; + break; + default: + usage(); + } + } + + if (optind != argc - 1) + usage(); + + dbuf.mv_size = 4096; + dbuf.mv_data = malloc(dbuf.mv_size); + + if (!(mode & NOHDR)) + readhdr(); + + envname = argv[optind]; + rc = mdbx_env_create(&env); + if (rc) { + fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, + mdbx_strerror(rc)); + return EXIT_FAILURE; + } + + mdbx_env_set_maxdbs(env, 2); + + if (info.me_maxreaders) + mdbx_env_set_maxreaders(env, info.me_maxreaders); + + if (info.me_mapsize) + mdbx_env_set_mapsize(env, info.me_mapsize); + + if (info.me_mapaddr) + envflags |= MDB_FIXEDMAP; + + rc = mdbx_env_open(env, envname, envflags, 0664); + if (rc) { + fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + kbuf.mv_size = mdbx_env_get_maxkeysize(env) * 2 + 2; + kbuf.mv_data = malloc(kbuf.mv_size); + + while (!Eof) { + MDB_val key, data; + int batch = 0; + + rc = mdbx_txn_begin(env, NULL, 0, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + rc = mdbx_dbi_open(txn, subname, dbi_flags | MDB_CREATE, &dbi); + if (rc) { + fprintf(stderr, "mdbx_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + + while (1) { + rc = readline(&key, &kbuf); + if (rc) /* rc == EOF */ + break; + + rc = readline(&data, &dbuf); + if (rc) { + fprintf(stderr, "%s: line %zd: failed to read key value\n", prog, + 
lineno); + goto txn_abort; + } + + rc = mdbx_cursor_put(mc, &key, &data, putflags); + if (rc == MDB_KEYEXIST && putflags) + continue; + if (rc) { + fprintf(stderr, "mdbx_cursor_put failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + batch++; + if (batch == 100) { + rc = mdbx_txn_commit(txn); + if (rc) { + fprintf(stderr, "%s: line %zd: txn_commit: %s\n", prog, lineno, + mdbx_strerror(rc)); + goto env_close; + } + rc = mdbx_txn_begin(env, NULL, 0, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + batch = 0; + } + } + rc = mdbx_txn_commit(txn); + txn = NULL; + if (rc) { + fprintf(stderr, "%s: line %zd: txn_commit: %s\n", prog, lineno, + mdbx_strerror(rc)); + goto env_close; + } + mdbx_dbi_close(env, dbi); + if (!(mode & NOHDR)) + readhdr(); + } + +txn_abort: + mdbx_txn_abort(txn); +env_close: + mdbx_env_close(env); + + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/mdb_stat.1 b/mdbx_stat.1 similarity index 95% rename from mdb_stat.1 rename to mdbx_stat.1 index bb659744..096fffc1 100644 --- a/mdb_stat.1 +++ b/mdbx_stat.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_STAT 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_stat \- LMDB environment status tool +mdbx_stat \- LMDB environment status tool .SH SYNOPSIS -.B mdb_stat +.B mdbx_stat [\c .BR \-V ] [\c @@ -23,7 +23,7 @@ mdb_stat \- LMDB environment status tool .BR \ envpath .SH DESCRIPTION The -.B mdb_stat +.B mdbx_stat utility displays the status of an LMDB environment. .SH OPTIONS .TP @@ -61,6 +61,6 @@ Exit status is zero if no errors occur. Errors result in a non-zero exit status and a diagnostic message being written to standard error. 
.SH "SEE ALSO" -.BR mdb_copy (1) +.BR mdbx_copy (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_stat.c b/mdbx_stat.c new file mode 100644 index 00000000..ca72b290 --- /dev/null +++ b/mdbx_stat.c @@ -0,0 +1,306 @@ +/* mdbx_stat.c - memory-mapped database status tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2011-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "mdbx.h" +#include +#include +#include +#include + +static void prstat(MDBX_stat *ms) { + printf(" Page size: %u\n", ms->ms_psize); + printf(" Tree depth: %u\n", ms->ms_depth); + printf(" Branch pages: %zu\n", ms->ms_branch_pages); + printf(" Leaf pages: %zu\n", ms->ms_leaf_pages); + printf(" Overflow pages: %zu\n", ms->ms_overflow_pages); + printf(" Entries: %zu\n", ms->ms_entries); +} + +static void usage(char *prog) { + fprintf(stderr, + "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-a|-s subdb] dbpath\n", + prog); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) { + int i, rc; + MDB_env *env; + MDB_txn *txn; + MDB_dbi dbi; + MDBX_stat mst; + MDBX_envinfo mei; + char *prog = argv[0]; + char *envname; + char *subname = NULL; + int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0, rdrinfo = 0; + + if (argc < 2) { + usage(prog); + } + + /* -a: print stat of main DB and all subDBs + * -s: print stat of only the named subDB + * -e: print env info + * -f: print freelist info + * -r: print reader info + * -n: use NOSUBDIR flag on env_open + * -V: print version and exit + * (default) print stat of only the main DB + */ + while ((i = getopt(argc, argv, "Vaefnrs:")) != EOF) { + switch (i) { + case 
'V': + printf("%s\n", MDB_VERSION_STRING); + exit(0); + break; + case 'a': + if (subname) + usage(prog); + alldbs++; + break; + case 'e': + envinfo++; + break; + case 'f': + freinfo++; + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 'r': + rdrinfo++; + break; + case 's': + if (alldbs) + usage(prog); + subname = optarg; + break; + default: + usage(prog); + } + } + + if (optind != argc - 1) + usage(prog); + + envname = argv[optind]; + rc = mdbx_env_create(&env); + if (rc) { + fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, + mdbx_strerror(rc)); + return EXIT_FAILURE; + } + + if (alldbs || subname) { + mdbx_env_set_maxdbs(env, 4); + } + + rc = mdbx_env_open(env, envname, envflags | MDB_RDONLY, 0664); + if (rc) { + fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + if (envinfo) { + (void)mdbx_env_stat(env, &mst, sizeof(mst)); + (void)mdbx_env_info(env, &mei, sizeof(mei)); + printf("Environment Info\n"); + printf(" Map address: %p\n", mei.me_mapaddr); + printf(" Map size: %zu\n", mei.me_mapsize); + printf(" Page size: %u\n", mst.ms_psize); + printf(" Max pages: %zu\n", mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %zu\n", mei.me_last_pgno + 1); + printf(" Last transaction ID: %zu\n", mei.me_last_txnid); + printf(" Tail transaction ID: %zu (%zi)\n", mei.me_tail_txnid, + mei.me_tail_txnid - mei.me_last_txnid); + printf(" Max readers: %u\n", mei.me_maxreaders); + printf(" Number of readers used: %u\n", mei.me_numreaders); + } else { + /* LY: zap warnings from gcc */ + memset(&mst, 0, sizeof(mst)); + memset(&mei, 0, sizeof(mei)); + } + + if (rdrinfo) { + printf("Reader Table Status\n"); + rc = mdbx_reader_list(env, (MDB_msg_func *)fputs, stdout); + if (rdrinfo > 1) { + int dead; + mdbx_reader_check(env, &dead); + printf(" %d stale readers cleared.\n", dead); + rc = mdbx_reader_list(env, (MDB_msg_func *)fputs, stdout); + } + if (!(subname || alldbs || freinfo)) + goto 
env_close; + } + + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + if (freinfo) { + MDB_cursor *cursor; + MDB_val key, data; + size_t pages = 0, *iptr; + size_t reclaimable = 0; + + printf("Freelist Status\n"); + dbi = 0; + rc = mdbx_cursor_open(txn, dbi, &cursor); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); + if (rc) { + fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + prstat(&mst); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + iptr = data.mv_data; + pages += *iptr; + if (envinfo && mei.me_tail_txnid > *(size_t *)key.mv_data) + reclaimable += *iptr; + if (freinfo > 1) { + char *bad = ""; + size_t pg, prev; + ssize_t i, j, span = 0; + j = *iptr++; + for (i = j, prev = 1; --i >= 0;) { + pg = iptr[i]; + if (pg <= prev) + bad = " [bad sequence]"; + prev = pg; + pg += span; + for (; i >= span && iptr[i - span] == pg; span++, pg++) + ; + } + printf(" Transaction %zu, %zd pages, maxspan %zd%s\n", + *(size_t *)key.mv_data, j, span, bad); + if (freinfo > 2) { + for (--j; j >= 0;) { + pg = iptr[j]; + for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) + ; + if (span > 1) + printf(" %9zu[%zd]\n", pg, span); + else + printf(" %9zu\n", pg); + } + } + } + } + mdbx_cursor_close(cursor); + if (envinfo) { + size_t value = mei.me_mapsize / mst.ms_psize; + double percent = value / 100.0; + printf("Page Allocation Info\n"); + printf(" Max pages: %9zu 100%%\n", value); + + value = mei.me_last_pgno + 1; + printf(" Number of pages used: %zu %.1f%%\n", value, value / percent); + + value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1); + printf(" Remained: %zu %.1f%%\n", value, value / percent); + + value = mei.me_last_pgno + 1 - pages; + printf(" Used now: 
%zu %.1f%%\n", value, value / percent); + + value = pages; + printf(" Unallocated: %zu %.1f%%\n", value, value / percent); + + value = pages - reclaimable; + printf(" Detained: %zu %.1f%%\n", value, value / percent); + + value = reclaimable; + printf(" Reclaimable: %zu %.1f%%\n", value, value / percent); + + value = + mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1) + reclaimable; + printf(" Available: %zu %.1f%%\n", value, value / percent); + } else + printf(" Free pages: %zu\n", pages); + } + + rc = mdbx_dbi_open(txn, subname, 0, &dbi); + if (rc) { + fprintf(stderr, "mdbx_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + + rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); + if (rc) { + fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + printf("Status of %s\n", subname ? subname : "Main DB"); + prstat(&mst); + + if (alldbs) { + MDB_cursor *cursor; + MDB_val key; + + rc = mdbx_cursor_open(txn, dbi, &cursor); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { + char *str; + MDB_dbi db2; + if (memchr(key.mv_data, '\0', key.mv_size)) + continue; + str = malloc(key.mv_size + 1); + memcpy(str, key.mv_data, key.mv_size); + str[key.mv_size] = '\0'; + rc = mdbx_dbi_open(txn, str, 0, &db2); + if (rc == MDB_SUCCESS) + printf("Status of %s\n", str); + free(str); + if (rc) + continue; + rc = mdbx_stat(txn, db2, &mst, sizeof(mst)); + if (rc) { + fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + prstat(&mst); + mdbx_dbi_close(env, db2); + } + mdbx_cursor_close(cursor); + } + + if (rc == MDB_NOTFOUND) + rc = MDB_SUCCESS; + + mdbx_dbi_close(env, dbi); +txn_abort: + mdbx_txn_abort(txn); +env_close: + mdbx_env_close(env); + + return rc ? 
EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/midl.c b/midl.c deleted file mode 100644 index 6d2417ac..00000000 --- a/midl.c +++ /dev/null @@ -1,361 +0,0 @@ -/** @file midl.c - * @brief ldap bdb back-end ID List functions */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2000-2017 The OpenLDAP Foundation. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include -#include -#include -#include -#include -#include "midl.h" - -/** @defgroup internal LMDB Internals - * @{ - */ -/** @defgroup idls ID List Management - * @{ - */ - -static unsigned __hot -mdb_midl_search( MDB_IDL ids, MDB_ID id ) -{ - /* - * binary search of id in ids - * if found, returns position of id - * if not found, returns first position greater than id - */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = ids[0]; - - while( 0 < n ) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = mdbx_cmp2int( ids[cursor], id ); - - if( val < 0 ) { - n = pivot; - - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - - } else { - return cursor; - } - } - - if( val > 0 ) { - ++cursor; - } - return cursor; -} - -#if 0 /* superseded by append/sort */ -static int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) -{ - unsigned x, i; - - x = mdb_midl_search( ids, id ); - assert( x > 0 ); - - if( x < 1 ) { - /* internal error */ - return -2; - } - - if ( x <= ids[0] && ids[x] == id ) { - /* duplicate */ - assert(0); - return -1; - } - - if ( ++ids[0] >= MDB_IDL_DB_MAX ) { - /* no room */ - --ids[0]; - return -2; - - } else { - /* insert id */ - for (i=ids[0]; i>x; i--) - ids[i] = ids[i-1]; - ids[x] = id; - } - - return 0; -} -#endif - 
-static MDB_IDL mdb_midl_alloc(int num) -{ - MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); - if (ids) { - *ids++ = num; - *ids = 0; - } - return ids; -} - -static void mdb_midl_free(MDB_IDL ids) -{ - if (ids) - free(ids-1); -} - -static void mdb_midl_shrink( MDB_IDL *idp ) -{ - MDB_IDL ids = *idp; - if (*(--ids) > MDB_IDL_UM_MAX && - (ids = realloc(ids, (MDB_IDL_UM_MAX+2) * sizeof(MDB_ID)))) - { - *ids++ = MDB_IDL_UM_MAX; - *idp = ids; - } -} - -static int mdb_midl_grow( MDB_IDL *idp, int num ) -{ - MDB_IDL idn = *idp-1; - /* grow it */ - idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); - if (!idn) - return ENOMEM; - *idn++ += num; - *idp = idn; - return 0; -} - -static int mdb_midl_need( MDB_IDL *idp, unsigned num ) -{ - MDB_IDL ids = *idp; - num += ids[0]; - if (num > ids[-1]) { - num = (num + num/4 + (256 + 2)) & -256; - if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) - return ENOMEM; - *ids++ = num - 2; - *idp = ids; - } - return 0; -} - -static int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) -{ - MDB_IDL ids = *idp; - /* Too big? */ - if (ids[0] >= ids[-1]) { - if (mdb_midl_grow(idp, MDB_IDL_UM_MAX)) - return ENOMEM; - ids = *idp; - } - ids[0]++; - ids[ids[0]] = id; - return 0; -} - -static int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) -{ - MDB_IDL ids = *idp; - /* Too big? */ - if (ids[0] + app[0] >= ids[-1]) { - if (mdb_midl_grow(idp, app[0])) - return ENOMEM; - ids = *idp; - } - memcpy(&ids[ids[0]+1], &app[1], app[0] * sizeof(MDB_ID)); - ids[0] += app[0]; - return 0; -} - -static int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) -{ - MDB_ID *ids = *idp, len = ids[0]; - /* Too big? 
*/ - if (len + n > ids[-1]) { - if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) - return ENOMEM; - ids = *idp; - } - ids[0] = len + n; - ids += len; - while (n) - ids[n--] = id++; - return 0; -} - -static void __hot -mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ) -{ - MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k; - idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ - old_id = idl[j]; - while (i) { - merge_id = merge[i--]; - for (; old_id < merge_id; old_id = idl[--j]) - idl[k--] = old_id; - idl[k--] = merge_id; - } - idl[0] = total; -} - -/* Quicksort + Insertion sort for small arrays */ - -#define SMALL 8 -#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } - -static void __hot -mdb_midl_sort( MDB_IDL ids ) -{ - /* Max possible depth of int-indexed tree * 2 items/level */ - int istack[sizeof(int)*CHAR_BIT * 2]; - int i,j,k,l,ir,jstack; - MDB_ID a, itmp; - - ir = (int)ids[0]; - l = 1; - jstack = 0; - for(;;) { - if (ir - l < SMALL) { /* Insertion sort */ - for (j=l+1;j<=ir;j++) { - a = ids[j]; - for (i=j-1;i>=1;i--) { - if (ids[i] >= a) break; - ids[i+1] = ids[i]; - } - ids[i+1] = a; - } - if (jstack == 0) break; - ir = istack[jstack--]; - l = istack[jstack--]; - } else { - k = (l + ir) >> 1; /* Choose median of left, center, right */ - MIDL_SWAP(ids[k], ids[l+1]); - if (ids[l] < ids[ir]) { - MIDL_SWAP(ids[l], ids[ir]); - } - if (ids[l+1] < ids[ir]) { - MIDL_SWAP(ids[l+1], ids[ir]); - } - if (ids[l] < ids[l+1]) { - MIDL_SWAP(ids[l], ids[l+1]); - } - i = l+1; - j = ir; - a = ids[l+1]; - for(;;) { - do i++; while(ids[i] > a); - do j--; while(ids[j] < a); - if (j < i) break; - MIDL_SWAP(ids[i],ids[j]); - } - ids[l+1] = ids[j]; - ids[j] = a; - jstack += 2; - if (ir-i+1 >= j-l) { - istack[jstack] = ir; - istack[jstack-1] = i; - ir = j-1; - } else { - istack[jstack] = j-1; - istack[jstack-1] = l; - l = i; - } - } - } -} - -static unsigned __hot -mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ) -{ - /* - * binary search of id in ids - * if found, 
returns position of id - * if not found, returns first position greater than id - */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = (unsigned)ids[0].mid; - - while( 0 < n ) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = mdbx_cmp2int( id, ids[cursor].mid ); - - if( val < 0 ) { - n = pivot; - - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - - } else { - return cursor; - } - } - - if( val > 0 ) { - ++cursor; - } - return cursor; -} - -static int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ) -{ - unsigned x, i; - - x = mdb_mid2l_search( ids, id->mid ); - - if( x < 1 ) { - /* internal error */ - return -2; - } - - if ( x <= ids[0].mid && ids[x].mid == id->mid ) { - /* duplicate */ - return -1; - } - - if ( ids[0].mid >= MDB_IDL_UM_MAX ) { - /* too big */ - return -2; - - } else { - /* insert id */ - ids[0].mid++; - for (i=(unsigned)ids[0].mid; i>x; i--) - ids[i] = ids[i-1]; - ids[x] = *id; - } - - return 0; -} - -static int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) -{ - /* Too big? */ - if (ids[0].mid >= MDB_IDL_UM_MAX) { - return -2; - } - ids[0].mid++; - ids[ids[0].mid] = *id; - return 0; -} - -/** @} */ -/** @} */ diff --git a/midl.h b/midl.h index 1bdffce1..eccc6099 100644 --- a/midl.h +++ b/midl.h @@ -1,190 +1,53 @@ -/** @file midl.h - * @brief LMDB ID List header file. - * - * This file was originally part of back-bdb but has been - * modified for use in libmdb. Most of the macros defined - * in this file are unused, just left over from the original. - * - * This file is only used internally in libmdb and its definitions - * are not exposed publicly. +/** A generic unsigned ID number. These were entryIDs in back-bdb. + * Preferably it should have the same size as a pointer. */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2000-2017 The OpenLDAP Foundation. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#ifndef _MDB_MIDL_H_ -#define _MDB_MIDL_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/** @defgroup internal LMDB Internals - * @{ - */ - -/** @defgroup idls ID List Management - * @{ - */ - /** A generic unsigned ID number. These were entryIDs in back-bdb. - * Preferably it should have the same size as a pointer. - */ typedef size_t MDB_ID; - /** An IDL is an ID List, a sorted array of IDs. The first - * element of the array is a counter for how many actual - * IDs are in the list. In the original back-bdb code, IDLs are - * sorted in ascending order. For libmdb IDLs are sorted in - * descending order. - */ +/** An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the original back-bdb code, IDLs are + * sorted in ascending order. For libmdb IDLs are sorted in + * descending order. + */ typedef MDB_ID *MDB_IDL; /* IDL sizes - likely should be even bigger * limiting factors: sizeof(ID), thread stack size */ -#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ -#define MDB_IDL_DB_SIZE (1<. */ +#include "mdbx.h" +#include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #include #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -void* thread_entry(void *ctx) -{ - MDB_env *env = ctx; - MDB_txn *txn; - int rc; +void *thread_entry(void *ctx) { + MDB_env *env = ctx; + MDB_txn *txn; + int rc; - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - mdb_txn_abort(txn); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + mdbx_txn_abort(txn); - return NULL; + return NULL; } -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor, *cur2; - MDB_cursor_op op; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor, *cur2; + MDB_cursor_op op; + int count; + int *values; + char sval[32] = ""; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i in each iteration, since MDB_NOOVERWRITE may modify it */ - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_KEYEXIST, mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { - j++; - data.mv_size = sizeof(sval); - data.mv_data = sval; - } - } - if (j) printf("%d duplicates skipped\n", j); - E(mdb_txn_commit(txn)); - E(mdb_env_stat(env, &mst)); + printf("Adding %d values\n", count); + for (i = 0; i < count; i++) { + sprintf(sval, "%03x %d foo bar", values[i], values[i]); + /* Set in each iteration, since MDB_NOOVERWRITE may modify it */ + data.mv_size = sizeof(sval); + data.mv_data = sval; + 
if (RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { + j++; + data.mv_size = sizeof(sval); + data.mv_data = sval; + } + } + if (j) + printf("%d duplicates skipped\n", j); + E(mdbx_txn_commit(txn)); + E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - j=0; - key.mv_data = sval; - for (i= count - 1; i > -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + j = 0; + key.mv_data = sval; + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = 
mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor last\n"); - E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor last/prev\n"); - E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - E(mdb_cursor_get(cursor, &key, &data, MDB_PREV)); - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor last\n"); + E(mdbx_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor last/prev\n"); 
+ E(mdbx_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + E(mdbx_cursor_get(cursor, &key, &data, MDB_PREV)); + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - printf("Deleting with cursor\n"); - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_cursor_open(txn, dbi, &cur2)); - for (i=0; i<50; i++) { - if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, MDB_NEXT))) - break; - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - E(mdb_del(txn, dbi, &key, NULL)); - } + printf("Deleting with cursor\n"); + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_cursor_open(txn, dbi, &cur2)); + for (i = 0; i < 50; i++) { + if (RES(MDB_NOTFOUND, mdbx_cursor_get(cur2, &key, &data, MDB_NEXT))) + break; + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + E(mdbx_del(txn, dbi, &key, NULL)); + } - printf("Restarting cursor in txn\n"); - for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { - if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, op))) - break; - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } - mdb_cursor_close(cur2); - E(mdb_txn_commit(txn)); + printf("Restarting cursor in txn\n"); + for (op = MDB_FIRST, i = 0; i <= 32; op = MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdbx_cursor_get(cur2, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + } + 
mdbx_cursor_close(cur2); + E(mdbx_txn_commit(txn)); - for(i = 0; i < 41; ++i) { - pthread_t thread; - pthread_create(&thread, NULL, thread_entry, env); - } + for (i = 0; i < 41; ++i) { + pthread_t thread; + pthread_create(&thread, NULL, thread_entry, env); + } - printf("Restarting cursor outside txn\n"); - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { - if (RES(MDB_NOTFOUND, mdb_cursor_get(cursor, &key, &data, op))) - break; - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + printf("Restarting cursor outside txn\n"); + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + for (op = MDB_FIRST, i = 0; i <= 32; op = MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdbx_cursor_get(cursor, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + } + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); - return 0; + return 0; } diff --git a/mtest1.c b/mtest1.c index ffe79123..826462dc 100644 --- a/mtest1.c +++ b/mtest1.c @@ -14,187 +14,186 @@ /* Based on mtest2.c - memory-mapped database tester/toy */ +#include "mdbx.h" +#include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32] = ""; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i -= (rand()%5)) { - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - deleted++; - } - } - free(values); - printf("Deleted %d values\n", deleted); + int deleted = 0; + key.mv_data = sval; + for (i = count - 1; i > -1; i -= (rand() % 5)) { + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + deleted++; + } + } + free(values); + printf("Deleted %d values\n", deleted); - printf("check-preset-b.cursor-next\n"); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - int present_b = 0; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, 
(char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++present_b; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - CHECK(present_b == present_a - deleted, "mismatch"); + printf("check-preset-b.cursor-next\n"); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + int present_b = 0; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++present_b; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + CHECK(present_b == present_a - deleted, "mismatch"); - printf("check-preset-b.cursor-prev\n"); - j = 1; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++j; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - CHECK(present_b == j, "mismatch"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + printf("check-preset-b.cursor-prev\n"); + j = 1; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++j; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + CHECK(present_b == j, "mismatch"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - /********************* LY: kept DB dirty ****************/ - mdbx_env_close_ex(env, 1); - E(mdb_env_create(&env)); - E(mdb_env_set_maxdbs(env, 4)); - E(mdb_env_open(env, DBPATH, env_oflags, 0664)); + mdbx_dbi_close(env, dbi); + /********************* LY: kept DB dirty ****************/ + mdbx_env_close_ex(env, 1); + E(mdbx_env_create(&env)); + E(mdbx_env_set_maxdbs(env, 4)); + E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - printf("check-preset-c.cursor-next\n"); - 
E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_dbi_open(txn, "id1", 0, &dbi)); - E(mdb_cursor_open(txn, dbi, &cursor)); - int present_c = 0; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++present_c; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Rolled back %d deletion(s)\n", present_c - (present_a - deleted)); - CHECK(present_c > present_a - deleted, "mismatch"); + printf("check-preset-c.cursor-next\n"); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_dbi_open(txn, "id1", 0, &dbi)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + int present_c = 0; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++present_c; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Rolled back %d deletion(s)\n", present_c - (present_a - deleted)); + CHECK(present_c > present_a - deleted, "mismatch"); - printf("check-preset-d.cursor-prev\n"); - j = 1; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++j; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - CHECK(present_c == j, "mismatch"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + printf("check-preset-d.cursor-prev\n"); + j = 1; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++j; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + CHECK(present_c == j, "mismatch"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, 
dbi); - mdbx_env_close_ex(env, 0); + mdbx_dbi_close(env, dbi); + mdbx_env_close_ex(env, 0); - return 0; + return 0; } diff --git a/mtest2.c b/mtest2.c index 12b1e126..93caa6e9 100644 --- a/mtest2.c +++ b/mtest2.c @@ -17,136 +17,137 @@ /* Just like mtest.c, but using a subDB instead of the main DB */ +#include "mdbx.h" +#include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32] = ""; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - 
printf("Deleted %d values\n", j); + j = 0; + key.mv_data = sval; + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest3.c 
b/mtest3.c index a55ec604..be46fe06 100644 --- a/mtest3.c +++ b/mtest3.c @@ -16,146 +16,147 @@ */ /* Tests for sorted duplicate DBs */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32]; + char kval[sizeof(int)]; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - memset(sval, 0, sizeof(sval)); + memset(sval, 0, sizeof(sval)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(kval, "%03x", values[i & ~0x0f]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, 
mdb_del(txn, dbi, &key, &data))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(kval, "%03x", values[i & ~0x0f]); + sprintf(sval, "%03x %d foo bar", values[i], values[i]); + key.mv_size = sizeof(int); + key.mv_data = kval; + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char 
*)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest4.c b/mtest4.c index 3d67a0f9..16aca90c 100644 --- a/mtest4.c +++ b/mtest4.c @@ -16,181 +16,181 @@ */ /* Tests for sorted duplicate DBs with fixed-size keys */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[8]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[8]; + char kval[sizeof(int)]; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - memset(sval, 0, sizeof(sval)); + (void)argc; + (void)argv; + memset(sval, 0, sizeof(sval)); - count = 510; - values = (int *)malloc(count*sizeof(int)); + count = 510; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i-= (rand()%3)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%07x", 
values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, &data))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + for (i = count - 1; i > -1; i -= (rand() % 3)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%07x", values[i]); + key.mv_size = sizeof(int); + key.mv_data = kval; + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", 
(int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest5.c b/mtest5.c index ed19f412..abca4e72 100644 --- a/mtest5.c +++ b/mtest5.c @@ -16,148 +16,149 @@ */ /* Tests for sorted duplicate DBs using cursor_put */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32]; + char kval[sizeof(int)]; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - memset(sval, 0, sizeof(sval)); + memset(sval, 0, sizeof(sval)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count 
* sizeof(int)); - for(i = 0;i -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(kval, "%03x", values[i & ~0x0f]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, &data))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(kval, "%03x", values[i & ~0x0f]); + sprintf(sval, "%03x %d foo bar", values[i], values[i]); + key.mv_size = sizeof(int); + key.mv_data = kval; + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", 
(int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest6.c b/mtest6.c index d988c93c..e7de6ab5 100644 --- a/mtest6.c +++ b/mtest6.c @@ -16,105 +16,109 @@ */ /* Tests for DB splits and merges */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif char dkbuf[1024]; -int main(int argc,char * argv[]) -{ - int i = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data, sdata; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - long kval; - char *sval; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data, sdata; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + long kval; + char *sval; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - E(mdb_env_create(&env)); - E(mdb_env_set_mapsize(env, 10485760)); - E(mdb_env_set_maxdbs(env, 4)); + E(mdbx_env_create(&env)); + E(mdbx_env_set_mapsize(env, 10485760)); + E(mdbx_env_set_maxdbs(env, 4)); - E(stat("/proc/self/exe", &exe_stat)?errno:0); - E(stat(DBPATH "/.", &db_stat)?errno:0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some dedicated storage. - */ - env_oflags = 0; - } - E(mdb_env_open(env, DBPATH, env_oflags, 0664)); + E(stat("/proc/self/exe", &exe_stat) ? errno : 0); + E(stat(DBPATH "/.", &db_stat) ? errno : 0); + env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { + /* LY: Assume running inside a CI-environment: + * 1) don't use FIXEDMAP to avoid EBUSY in case collision, + * which could be inspired by address space randomisation feature. + * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some + * dedicated storage. 
+ */ + env_oflags = 0; + } + E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - E(mdb_txn_begin(env, NULL, 0, &txn)); - if (mdb_dbi_open(txn, "id6", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdb_drop(txn, dbi, 1)); - E(mdb_dbi_open(txn, "id6", MDB_CREATE|MDB_INTEGERKEY, &dbi)); - E(mdb_cursor_open(txn, dbi, &cursor)); - E(mdb_stat(txn, dbi, &mst)); + E(mdbx_txn_begin(env, NULL, 0, &txn)); + if (mdbx_dbi_open(txn, "id6", MDB_CREATE, &dbi) == MDB_SUCCESS) + E(mdbx_drop(txn, dbi, 1)); + E(mdbx_dbi_open(txn, "id6", MDB_CREATE | MDB_INTEGERKEY, &dbi)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + E(mdbx_stat(txn, dbi, &mst, sizeof(mst))); - sval = calloc(1, mst.ms_psize / 4); - key.mv_size = sizeof(long); - key.mv_data = &kval; - sdata.mv_size = mst.ms_psize / 4 - 30; - sdata.mv_data = sval; + sval = calloc(1, mst.ms_psize / 4); + key.mv_size = sizeof(long); + key.mv_data = &kval; + sdata.mv_size = mst.ms_psize / 4 - 30; + sdata.mv_data = sval; - printf("Adding 12 values, should yield 3 splits\n"); - for (i=0;i<12;i++) { - kval = i*5; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, mdb_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - printf("Adding 12 more values, should yield 3 splits\n"); - for (i=0;i<12;i++) { - kval = i*5+4; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, mdb_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - printf("Adding 12 more values, should yield 3 splits\n"); - for (i=0;i<12;i++) { - kval = i*5+1; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, mdb_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - E(mdb_cursor_get(cursor, &key, &data, MDB_FIRST)); + printf("Adding 12 values, should yield 3 splits\n"); + for (i = 0; i < 12; i++) { + kval = i * 5; + sprintf(sval, "%08lx", kval); + data = sdata; + (void)RES(MDB_KEYEXIST, + mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); + } + printf("Adding 12 more values, should yield 3 splits\n"); + for (i = 
0; i < 12; i++) { + kval = i * 5 + 4; + sprintf(sval, "%08lx", kval); + data = sdata; + (void)RES(MDB_KEYEXIST, + mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); + } + printf("Adding 12 more values, should yield 3 splits\n"); + for (i = 0; i < 12; i++) { + kval = i * 5 + 1; + sprintf(sval, "%08lx", kval); + data = sdata; + (void)RES(MDB_KEYEXIST, + mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); + } + E(mdbx_cursor_get(cursor, &key, &data, MDB_FIRST)); - do { - printf("key: %p %s, data: %p %.*s\n", - key.mv_data, mdb_dkey(&key, dkbuf), - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0); - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_commit(txn); + do { + printf("key: %p %s, data: %p %.*s\n", key.mv_data, mdbx_dkey(&key, dkbuf), + data.mv_data, (int)data.mv_size, (char *)data.mv_data); + } while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0); + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_commit(txn); #if 0 int j=0; @@ -124,47 +128,47 @@ int main(int argc,char * argv[]) for (i= count - 1; i > -1; i-= (rand()%5)) { j++; txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdbx_txn_begin(env, NULL, 0, &txn)); sprintf(kval, "%03x", values[i & ~0x0f]); sprintf(sval, "%03x %d foo bar", values[i], values[i]); key.mv_size = sizeof(int); key.mv_data = kval; data.mv_size = sizeof(sval); data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, &data))) { + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { j--; - mdb_txn_abort(txn); + mdbx_txn_abort(txn); } else { - E(mdb_txn_commit(txn)); + E(mdbx_txn_commit(txn)); } } free(values); printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, 
&txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { printf("key: %.*s, data: %.*s\n", (int) key.mv_size, (char *) key.mv_data, (int) data.mv_size, (char *) data.mv_data); } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { printf("key: %.*s, data: %.*s\n", (int) key.mv_size, (char *) key.mv_data, (int) data.mv_size, (char *) data.mv_data); } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); + mdbx_dbi_close(env, dbi); #endif - mdb_env_close(env); - free(sval); + mdbx_env_close(env); + free(sval); - return 0; + return 0; } diff --git a/sample-mdb.txt b/sample-mdb.txt index 24fccdb9..194afdcc 100644 --- a/sample-mdb.txt +++ b/sample-mdb.txt @@ -33,10 +33,10 @@ int main(int argc,char * argv[]) /* Note: Most error checking omitted for simplicity */ - rc = mdb_env_create(&env); - rc = mdb_env_open(env, "./testdb", 0, 0664); - rc = mdb_txn_begin(env, NULL, 0, &txn); - rc = mdb_dbi_open(txn, NULL, 0, &dbi); + rc = mdbx_env_create(&env); + rc = mdbx_env_open(env, "./testdb", 0, 0664); + rc = mdbx_txn_begin(env, NULL, 0, &txn); + rc = mdbx_dbi_open(txn, NULL, 0, &dbi); key.mv_size = sizeof(int); key.mv_data = sval; @@ -44,23 +44,23 @@ int main(int argc,char * argv[]) data.mv_data = sval; sprintf(sval, "%03x %d foo bar", 32, 3141592); - rc = mdb_put(txn, dbi, &key, &data, 0); - rc = mdb_txn_commit(txn); + rc = mdbx_put(txn, dbi, &key, &data, 0); + rc = mdbx_txn_commit(txn); if (rc) { - fprintf(stderr, "mdb_txn_commit: (%d) %s\n", rc, 
mdb_strerror(rc)); + fprintf(stderr, "mdbx_txn_commit: (%d) %s\n", rc, mdbx_strerror(rc)); goto leave; } - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - rc = mdb_cursor_open(txn, dbi, &cursor); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_cursor_open(txn, dbi, &cursor); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int) key.mv_size, (char *) key.mv_data, data.mv_data, (int) data.mv_size, (char *) data.mv_data); } - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); leave: - mdb_dbi_close(env, dbi); - mdb_env_close(env); + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); return 0; } diff --git a/wbench.c b/wbench.c index e5fdc64a..debb8be9 100644 --- a/wbench.c +++ b/wbench.c @@ -12,248 +12,249 @@ * . */ +#include #include #include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include #include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif struct t0 { - struct rusage ru; - struct timespec ts; + struct rusage ru; + struct timespec ts; }; -void t0(struct t0 *t0) -{ - int rc; - E(getrusage(RUSAGE_SELF, &t0->ru)); - E(clock_gettime(CLOCK_MONOTONIC_RAW, &t0->ts)); +void t0(struct t0 *t0) { + int rc; + E(getrusage(RUSAGE_SELF, &t0->ru)); + E(clock_gettime(CLOCK_MONOTONIC_RAW, &t0->ts)); } struct info { - double wall_s, cpu_sys_s, cpu_user_s; - long iops_r, iops_w, iops_pf; + double wall_s, cpu_sys_s, cpu_user_s; + long iops_r, iops_w, iops_pf; }; -double delta_s(const struct timeval *begin, const struct timeval *end) -{ - return end->tv_sec - begin->tv_sec - + (end->tv_usec - begin->tv_usec) / 1000000.0; +double delta_s(const struct timeval *begin, const struct timeval *end) { + return end->tv_sec - begin->tv_sec + + (end->tv_usec - begin->tv_usec) / 1000000.0; } -double delta2_s(const struct timespec *begin, const struct timespec *end) -{ - return end->tv_sec - begin->tv_sec - + (end->tv_nsec - begin->tv_nsec) / 1000000000.0; +double delta2_s(const struct timespec *begin, const struct timespec *end) { + return end->tv_sec - begin->tv_sec + + (end->tv_nsec - begin->tv_nsec) / 1000000000.0; } -void measure(const struct t0 *t0, struct info *i) -{ - struct t0 t1; - int rc; +void measure(const struct t0 *t0, struct info *i) { + struct t0 t1; + int rc; - E(clock_gettime(CLOCK_MONOTONIC_RAW, &t1.ts)); - E(getrusage(RUSAGE_SELF, &t1.ru)); + E(clock_gettime(CLOCK_MONOTONIC_RAW, &t1.ts)); + E(getrusage(RUSAGE_SELF, &t1.ru)); - i->wall_s = delta2_s(&t0->ts, &t1.ts); - i->cpu_user_s = delta_s(&t0->ru.ru_utime, &t1.ru.ru_utime); - i->cpu_sys_s = delta_s(&t0->ru.ru_stime, &t1.ru.ru_stime); - i->iops_r = t1.ru.ru_inblock - t0->ru.ru_inblock; - i->iops_w = t1.ru.ru_oublock - t0->ru.ru_oublock; - i->iops_pf = t1.ru.ru_majflt - 
t0->ru.ru_majflt - + t1.ru.ru_minflt - t0->ru.ru_minflt; + i->wall_s = delta2_s(&t0->ts, &t1.ts); + i->cpu_user_s = delta_s(&t0->ru.ru_utime, &t1.ru.ru_utime); + i->cpu_sys_s = delta_s(&t0->ru.ru_stime, &t1.ru.ru_stime); + i->iops_r = t1.ru.ru_inblock - t0->ru.ru_inblock; + i->iops_w = t1.ru.ru_oublock - t0->ru.ru_oublock; + i->iops_pf = + t1.ru.ru_majflt - t0->ru.ru_majflt + t1.ru.ru_minflt - t0->ru.ru_minflt; } -void print(struct info *i) -{ - printf("wall-clock %.3f, iops: %lu reads, %lu writes, %lu page-faults, " - "cpu: %.3f user, %.3f sys\n", - i->wall_s, i->iops_r, i->iops_w, i->iops_pf, i->cpu_user_s, i->cpu_sys_s); - +void print(struct info *i) { + printf("wall-clock %.3f, iops: %lu reads, %lu writes, %lu page-faults, " + "cpu: %.3f user, %.3f sys\n", + i->wall_s, i->iops_r, i->iops_w, i->iops_pf, i->cpu_user_s, + i->cpu_sys_s); } -static void wbench(int flags, int mb, int count, int salt) -{ - MDB_env *env; - MDB_dbi dbi; - MDB_txn *txn; - MDB_val key, data; - unsigned key_value = salt; - char data_value[777]; - int i, rc; - struct t0 start; - struct info ra, rd, rs, rt; +static void wbench(int flags, int mb, int count, int salt) { + MDB_env *env; + MDB_dbi dbi; + MDB_txn *txn; + MDB_val key, data; + unsigned key_value = salt; + char data_value[777]; + int i, rc; + struct t0 start; + struct info ra, rd, rs, rt; - mkdir(DBPATH, 0755); - unlink(DBPATH "/data.mdb"); - unlink(DBPATH "/lock.mdb"); + mkdir(DBPATH, 0755); + unlink(DBPATH "/data.mdb"); + unlink(DBPATH "/lock.mdb"); - printf("\nProbing %d Mb, %d items, flags:", mb, count); - if (flags & MDB_NOSYNC) - printf(" NOSYNC"); - if (flags & MDB_NOMETASYNC) - printf(" NOMETASYNC"); - if (flags & MDB_WRITEMAP) - printf(" WRITEMAP"); - if (flags & MDB_MAPASYNC) - printf(" MAPASYNC"); + printf("\nProbing %d Mb, %d items, flags:", mb, count); + if (flags & MDB_NOSYNC) + printf(" NOSYNC"); + if (flags & MDB_NOMETASYNC) + printf(" NOMETASYNC"); + if (flags & MDB_WRITEMAP) + printf(" WRITEMAP"); + if (flags & 
MDB_MAPASYNC) + printf(" MAPASYNC"); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - if (flags & MDBX_COALESCE) - printf(" COALESCE"); - if (flags & MDBX_LIFORECLAIM) - printf(" LIFO"); + if (flags & MDBX_COALESCE) + printf(" COALESCE"); + if (flags & MDBX_LIFORECLAIM) + printf(" LIFO"); #endif - printf(" 0x%X\n", flags); + printf(" 0x%X\n", flags); - E(mdb_env_create(&env)); - E(mdb_env_set_mapsize(env, (1ull << 20) * mb)); - E(mdb_env_open(env, DBPATH, flags, 0664)); + E(mdbx_env_create(&env)); + E(mdbx_env_set_mapsize(env, (1ull << 20) * mb)); + E(mdbx_env_open(env, DBPATH, flags, 0664)); - key.mv_size = sizeof(key_value); - key.mv_data = &key_value; - data.mv_size = sizeof(data_value); - data.mv_data = &data_value; + key.mv_size = sizeof(key_value); + key.mv_data = &key_value; + data.mv_size = sizeof(data_value); + data.mv_data = &data_value; - printf("\tAdding %d values...", count); - fflush(stdout); - key_value = salt; - t0(&start); - for(i = 0; i < count; ++i) { - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_dbi_open(txn, NULL, 0, &dbi)); + printf("\tAdding %d values...", count); + fflush(stdout); + key_value = salt; + t0(&start); + for (i = 0; i < count; ++i) { + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_dbi_open(txn, NULL, 0, &dbi)); - snprintf(data_value, sizeof(data_value), "value=%u", key_value); - E(mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE)); - E(mdb_txn_commit(txn)); + snprintf(data_value, sizeof(data_value), "value=%u", key_value); + E(mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE)); + E(mdbx_txn_commit(txn)); - key_value = key_value * 1664525 + 1013904223; - } - measure(&start, &ra); - print(&ra); + key_value = key_value * 1664525 + 1013904223; + } + measure(&start, &ra); + print(&ra); - printf("\tDeleting %d values...", count); - fflush(stdout); - key_value = salt; - t0(&start); - for(i = 0; i < count; ++i) { - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_dbi_open(txn, NULL, 0, &dbi)); + printf("\tDeleting %d values...", 
count); + fflush(stdout); + key_value = salt; + t0(&start); + for (i = 0; i < count; ++i) { + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_dbi_open(txn, NULL, 0, &dbi)); - E(mdb_del(txn, dbi, &key, NULL)); - E(mdb_txn_commit(txn)); + E(mdbx_del(txn, dbi, &key, NULL)); + E(mdbx_txn_commit(txn)); - key_value = key_value * 1664525 + 1013904223; - } - measure(&start, &rd); - print(&rd); + key_value = key_value * 1664525 + 1013904223; + } + measure(&start, &rd); + print(&rd); - printf("\tCheckpoint..."); - fflush(stdout); - t0(&start); - mdb_env_sync(env, 1); - measure(&start, &rs); - print(&rs); + printf("\tCheckpoint..."); + fflush(stdout); + t0(&start); + mdbx_env_sync(env, 1); + measure(&start, &rs); + print(&rs); - mdb_env_close(env); - rt.wall_s = ra.wall_s + rd.wall_s + rs.wall_s; - rt.cpu_sys_s = ra.cpu_sys_s + rd.cpu_sys_s + rs.cpu_sys_s; - rt.cpu_user_s = ra.cpu_user_s + rd.cpu_user_s + rs.cpu_user_s; - rt.iops_r = ra.iops_r + rd.iops_r + rs.iops_r; - rt.iops_w = ra.iops_w + rd.iops_w + rs.iops_w; - rt.iops_pf = ra.iops_pf + rd.iops_pf + rs.iops_pf; - printf("Total "); - print(&rt); + mdbx_env_close(env); + rt.wall_s = ra.wall_s + rd.wall_s + rs.wall_s; + rt.cpu_sys_s = ra.cpu_sys_s + rd.cpu_sys_s + rs.cpu_sys_s; + rt.cpu_user_s = ra.cpu_user_s + rd.cpu_user_s + rs.cpu_user_s; + rt.iops_r = ra.iops_r + rd.iops_r + rs.iops_r; + rt.iops_w = ra.iops_w + rd.iops_w + rs.iops_w; + rt.iops_pf = ra.iops_pf + rd.iops_pf + rs.iops_pf; + printf("Total "); + print(&rt); - fprintf(stderr, "flags: "); - if (flags & MDB_NOSYNC) - fprintf(stderr, " NOSYNC"); - if (flags & MDB_NOMETASYNC) - fprintf(stderr, " NOMETASYNC"); - if (flags & MDB_WRITEMAP) - fprintf(stderr, " WRITEMAP"); - if (flags & MDB_MAPASYNC) - fprintf(stderr, " MAPASYNC"); + fprintf(stderr, "flags: "); + if (flags & MDB_NOSYNC) + fprintf(stderr, " NOSYNC"); + if (flags & MDB_NOMETASYNC) + fprintf(stderr, " NOMETASYNC"); + if (flags & MDB_WRITEMAP) + fprintf(stderr, " WRITEMAP"); + if (flags & MDB_MAPASYNC) 
+ fprintf(stderr, " MAPASYNC"); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - if (flags & MDBX_COALESCE) - fprintf(stderr, " COALESCE"); - if (flags & MDBX_LIFORECLAIM) - fprintf(stderr, " LIFO"); + if (flags & MDBX_COALESCE) + fprintf(stderr, " COALESCE"); + if (flags & MDBX_LIFORECLAIM) + fprintf(stderr, " LIFO"); #endif - fprintf(stderr, "\t%.3f\t%.3f\t%.3f\t%.3f\n", rt.iops_w / 1000.0, rt.cpu_user_s, rt.cpu_sys_s, rt.wall_s); - + fprintf(stderr, "\t%.3f\t%.3f\t%.3f\t%.3f\n", rt.iops_w / 1000.0, + rt.cpu_user_s, rt.cpu_sys_s, rt.wall_s); } -int main(int argc,char * argv[]) -{ - (void) argc; - (void) argv; +int main(int argc, char *argv[]) { + (void)argc; + (void)argv; -#define SALT 1 -#define COUNT 10000 -#define SIZE 12 +#define SALT 1 +#define COUNT 10000 +#define SIZE 12 - printf("\nDefault 'sync' mode..."); - wbench(0, SIZE, COUNT, SALT); + printf("\nDefault 'sync' mode..."); + wbench(0, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); // wbench(MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nno-meta-sync hack..."); - wbench(MDB_NOMETASYNC, SIZE, COUNT, SALT); + printf("\nno-meta-sync hack..."); + wbench(MDB_NOMETASYNC, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_NOMETASYNC | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_NOMETASYNC | MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDB_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); // wbench(MDB_NOMETASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nno-sync..."); - wbench(MDB_NOSYNC, SIZE, COUNT, SALT); + printf("\nno-sync..."); + wbench(MDB_NOSYNC, SIZE, COUNT, SALT); #if 
defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) // wbench(MDB_NOSYNC | MDBX_COALESCE, SIZE, COUNT, SALT); -// wbench(MDB_NOSYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); +// wbench(MDB_NOSYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, +// SALT); // wbench(MDB_NOSYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nr/w-map..."); - wbench(MDB_WRITEMAP, SIZE, COUNT, SALT); + printf("\nr/w-map..."); + wbench(MDB_WRITEMAP, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); // wbench(MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nasync..."); - wbench(MDB_WRITEMAP | MDB_MAPASYNC, SIZE, COUNT, SALT); + printf("\nasync..."); + wbench(MDB_WRITEMAP | MDB_MAPASYNC, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -// wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE, SIZE, COUNT, + // SALT); + wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, + COUNT, SALT); +// wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, +// SALT); #endif - printf("\nr/w-map + no-sync..."); - wbench(MDB_NOSYNC | MDB_WRITEMAP, SIZE, COUNT, SALT); + printf("\nr/w-map + no-sync..."); + wbench(MDB_NOSYNC | MDB_WRITEMAP, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE | 
MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -// wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, + COUNT, SALT); +// wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, +// SALT); #endif - return 0; + return 0; } diff --git a/yota_test1.c b/yota_test1.c index 0cad5468..701d748c 100644 --- a/yota_test1.c +++ b/yota_test1.c @@ -1,6 +1,7 @@ /* * Copyright 2016-2017 Leonid Yuriev . - * Copyright 2015 Vladimir Romanov , Yota Lab. + * Copyright 2015 Vladimir Romanov + * , Yota Lab. * * This file is part of libmdbx. * @@ -18,243 +19,259 @@ * along with this program. If not, see . */ -#include #include +#include +#include "mdbx.h" +#include #include #include -#include -#include #include +#include #include -#include -#include "mdbx.h" +#include -#define IP_PRINTF_ARG_HOST(addr) (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), (int)((addr) & 0xff) +#define IP_PRINTF_ARG_HOST(addr) \ + (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), \ + (int)((addr)&0xff) -char opt_db_path[PATH_MAX] = "/dev/shm/lmdb_bench1"; +char opt_db_path[PATH_MAX] = "/dev/shm/x_bench1"; static MDB_env *env; #define REC_COUNT 1000000 int64_t ids[REC_COUNT + REC_COUNT / 10]; int32_t ids_count = 0; -int64_t lmdb_add = 0; -int64_t lmdb_del = 0; +int64_t x_add = 0; +int64_t x_del = 0; int64_t obj_id = 0; static void add_id_to_pool(int64_t id) { - ids[ids_count] = id; - ids_count++; + ids[ids_count] = id; + ids_count++; } static inline int64_t getTimeMicroseconds(void) { - struct timeval val; - gettimeofday(&val, NULL); - return val.tv_sec * ((int64_t) 1000000) + val.tv_usec; + struct timeval val; + gettimeofday(&val, NULL); + return val.tv_sec * ((int64_t)1000000) + val.tv_usec; } static int64_t get_id_from_pool() { - if (ids_count == 0) { - return -1; - } - int32_t index = 
rand() % ids_count; - int64_t id = ids[index]; - ids[index] = ids[ids_count - 1]; - ids_count--; - return id; + if (ids_count == 0) { + return -1; + } + int32_t index = rand() % ids_count; + int64_t id = ids[index]; + ids[index] = ids[ids_count - 1]; + ids_count--; + return id; } -#define LMDB_CHECK(x) \ - do {\ - const int rc = (x);\ - if ( rc != MDB_SUCCESS ) {\ - printf("Error [%d] %s in %s at %s:%d\n", rc, mdb_strerror(rc), #x, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - }\ - } while(0) +#define LMDB_CHECK(x) \ + do { \ + const int rc = (x); \ + if (rc != MDB_SUCCESS) { \ + printf("Error [%d] %s in %s at %s:%d\n", rc, mdbx_strerror(rc), #x, \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) static void db_connect() { - LMDB_CHECK(mdb_env_create(&env)); - LMDB_CHECK(mdb_env_set_mapsize(env, 3L * 1024L * 1024L * 1024L)); - LMDB_CHECK(mdb_env_set_maxdbs(env, 30)); + LMDB_CHECK(mdbx_env_create(&env)); + LMDB_CHECK(mdbx_env_set_mapsize(env, 3L * 1024L * 1024L * 1024L)); + LMDB_CHECK(mdbx_env_set_maxdbs(env, 30)); #if defined(MDBX_LIFORECLAIM) - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); + LMDB_CHECK(mdbx_env_open( + env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); #else - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); + LMDB_CHECK(mdbx_env_open(env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); #endif - printf("Connection open\n"); + printf("Connection open\n"); } typedef struct { - char session_id1[100]; - char session_id2[100]; - char ip[20]; - uint8_t fill[100]; + char session_id1[100]; + char session_id2[100]; + char ip[20]; + uint8_t fill[100]; } session_data_t; typedef struct { - int64_t obj_id; - int8_t event_type; + int64_t obj_id; + int8_t event_type; } __attribute__((__packed__)) event_data_t; static void create_record(int64_t record_id) { - MDB_dbi dbi_session; - 
MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - session_data_t data; - // transaction init - snprintf(data.session_id1, sizeof (data.session_id1), "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", record_id % 3 + 1, record_id % 9 + 1, record_id); - snprintf(data.session_id2, sizeof (data.session_id2), "gx_service;%ld;%ld;node@spb-jsm1", record_id, record_id % 1000000000 + 99999); - snprintf(data.ip, sizeof (data.ip), "%d.%d.%d.%d", IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); - event.obj_id = record_id; - event.event_type = 1; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t event; + MDB_txn *txn; + session_data_t data; + // transaction init + snprintf(data.session_id1, sizeof(data.session_id1), + "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", + record_id % 3 + 1, record_id % 9 + 1, record_id); + snprintf(data.session_id2, sizeof(data.session_id2), + "gx_service;%ld;%ld;node@spb-jsm1", record_id, + record_id % 1000000000 + 99999); + snprintf(data.ip, sizeof(data.ip), "%d.%d.%d.%d", + IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); + event.obj_id = record_id; + event.event_type = 1; - MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; - MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; - MDB_val _ip_rec = {data.ip, strlen(data.ip)}; - MDB_val _obj_id_rec = {&record_id, sizeof (record_id)}; - MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + (rand() % sizeof (data.fill))}; - MDB_val _event_rec = {&event, sizeof (event)}; + MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; + MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; + MDB_val _ip_rec = {data.ip, strlen(data.ip)}; + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + + (rand() % sizeof(data.fill))}; + MDB_val _event_rec = {&event, 
sizeof(event)}; - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - LMDB_CHECK(mdb_put(txn, dbi_session, &_obj_id_rec, &_data_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); - LMDB_CHECK(mdb_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + LMDB_CHECK(mdbx_put(txn, dbi_session, &_obj_id_rec, &_data_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); + LMDB_CHECK(mdbx_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_add++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_add++; } static void delete_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t 
event; + MDB_txn *txn; - // transaction init - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // put data - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val v_rec; - // get data - LMDB_CHECK(mdb_get(txn, dbi_session, &_obj_id_rec, &v_rec)); - session_data_t* data = (session_data_t*) v_rec.mv_data; + // transaction init + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + // open database in read-write mode + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + // put data + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val v_rec; + // get data + LMDB_CHECK(mdbx_get(txn, dbi_session, &_obj_id_rec, &v_rec)); + session_data_t *data = (session_data_t *)v_rec.mv_data; - MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; - MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; - MDB_val _ip_rec = {data->ip, strlen(data->ip)}; - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id1_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id2_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_ip, &_ip_rec, NULL)); - event.obj_id = record_id; - event.event_type = 1; - MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdb_del(txn, dbi_event, &_event_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session, &_obj_id_rec, NULL)); + MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; + MDB_val _session_id2_rec = {data->session_id2, 
strlen(data->session_id2)}; + MDB_val _ip_rec = {data->ip, strlen(data->ip)}; + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id1_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id2_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_ip, &_ip_rec, NULL)); + event.obj_id = record_id; + event.event_type = 1; + MDB_val _event_rec = {&event, sizeof(event)}; + LMDB_CHECK(mdbx_del(txn, dbi_event, &_event_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session, &_obj_id_rec, NULL)); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_del++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_del++; } static void db_disconnect() { - mdb_env_close(env); - printf("Connection closed\n"); + mdbx_env_close(env); + printf("Connection closed\n"); } -static void get_db_stat(const char* db, int64_t* ms_branch_pages, int64_t* ms_leaf_pages) { - MDB_txn *txn; - MDB_stat stat; - MDB_dbi dbi; +static void get_db_stat(const char *db, int64_t *ms_branch_pages, + int64_t *ms_leaf_pages) { + MDB_txn *txn; + MDBX_stat stat; + MDB_dbi dbi; - LMDB_CHECK(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdb_stat(txn, dbi, &stat)); - mdb_txn_abort(txn); - printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", - db, - stat.ms_branch_pages, - stat.ms_depth, - stat.ms_entries, - stat.ms_leaf_pages, - stat.ms_overflow_pages); - (*ms_branch_pages) += stat.ms_branch_pages; - (*ms_leaf_pages) += stat.ms_leaf_pages; + LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); + LMDB_CHECK(mdbx_stat(txn, dbi, &stat, sizeof(stat))); + mdbx_txn_abort(txn); + printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, + stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, + stat.ms_leaf_pages, stat.ms_overflow_pages); + (*ms_branch_pages) += stat.ms_branch_pages; + (*ms_leaf_pages) += stat.ms_leaf_pages; } static void periodic_stat(void) { - int64_t 
ms_branch_pages = 0; - int64_t ms_leaf_pages = 0; - printf(" Name | ms_branch_pages | depth | entries | leaf_pages | overf_pages |\n"); - get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); - printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, "", "", ms_leaf_pages, ""); - static int64_t prev_add; - static int64_t prev_del; - static int64_t t = -1; - if (t > 0) { - int64_t delta = getTimeMicroseconds() - t; - printf("CPS: add %ld, delete %ld, items processed - %ld\n", (lmdb_add - prev_add)*1000000 / delta, (lmdb_del - prev_del)*1000000 / delta, obj_id); - } - t = getTimeMicroseconds(); - prev_add = lmdb_add; - prev_del = lmdb_del; + int64_t ms_branch_pages = 0; + int64_t ms_leaf_pages = 0; + printf(" Name | ms_branch_pages | depth | entries | " + "leaf_pages | overf_pages |\n"); + get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); + printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, + "", "", ms_leaf_pages, ""); + static int64_t prev_add; + static int64_t prev_del; + static int64_t t = -1; + if (t > 0) { + int64_t delta = getTimeMicroseconds() - t; + printf("CPS: add %ld, delete %ld, items processed - %ld\n", + (x_add - prev_add) * 1000000 / delta, + (x_del - prev_del) * 1000000 / delta, obj_id); + } + t = getTimeMicroseconds(); + prev_add = x_add; + prev_del = x_del; } static void periodic_add_rec() { - int i; - for (i = 0; i < 10000; i++) { - if (ids_count <= REC_COUNT) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - } - if (ids_count > REC_COUNT) { - int64_t id = get_id_from_pool(); - delete_record(id); - } - } - periodic_stat(); + 
int i; + for (i = 0; i < 10000; i++) { + if (ids_count <= REC_COUNT) { + int64_t id = obj_id++; + create_record(id); + add_id_to_pool(id); + } + if (ids_count > REC_COUNT) { + int64_t id = get_id_from_pool(); + delete_record(id); + } + } + periodic_stat(); } -int main(int argc, char** argv) { - (void) argc; - (void) argv; +int main(int argc, char **argv) { + (void)argc; + (void)argv; - char filename[PATH_MAX]; - mkdir(opt_db_path, 0775); + char filename[PATH_MAX]; + mkdir(opt_db_path, 0775); - strcpy(filename, opt_db_path); - strcat(filename, "/data.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/data.mdb"); + remove(filename); - strcpy(filename, opt_db_path); - strcat(filename, "/lock.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/lock.mdb"); + remove(filename); - db_connect(); - while (1) { - periodic_add_rec(); - } - db_disconnect(); - return 0; + db_connect(); + while (1) { + periodic_add_rec(); + } + db_disconnect(); + return 0; } diff --git a/yota_test2.c b/yota_test2.c index 80dc4f2f..69d41c7c 100644 --- a/yota_test2.c +++ b/yota_test2.c @@ -1,6 +1,7 @@ /* * Copyright 2016-2017 Leonid Yuriev . - * Copyright 2015 Vladimir Romanov , Yota Lab. + * Copyright 2015 Vladimir Romanov + * , Yota Lab. * * This file is part of libmdbx. * @@ -18,233 +19,257 @@ * along with this program. If not, see . 
*/ -#include #include +#include +#include "mdbx.h" +#include #include #include -#include -#include #include +#include #include -#include -#include "mdbx.h" +#include -#define IP_PRINTF_ARG_HOST(addr) (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), (int)((addr) & 0xff) +#define IP_PRINTF_ARG_HOST(addr) \ + (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), \ + (int)((addr)&0xff) -char opt_db_path[PATH_MAX] = "/dev/shm/lmdb_bench2"; +char opt_db_path[PATH_MAX] = "/dev/shm/x_bench2"; static MDB_env *env; #define REC_COUNT 1024000 int64_t ids[REC_COUNT * 10]; int32_t ids_count = 0; -int64_t lmdb_add = 0; -int64_t lmdb_del = 0; +int64_t x_add = 0; +int64_t x_del = 0; int64_t obj_id = 0; -int64_t lmdb_data_size = 0; -int64_t lmdb_key_size = 0; +int64_t x_data_size = 0; +int64_t x_key_size = 0; static void add_id_to_pool(int64_t id) { - ids[ids_count] = id; - ids_count++; + ids[ids_count] = id; + ids_count++; } static inline int64_t getTimeMicroseconds(void) { - struct timeval val; - gettimeofday(&val, NULL); - return val.tv_sec * ((int64_t) 1000000) + val.tv_usec; + struct timeval val; + gettimeofday(&val, NULL); + return val.tv_sec * ((int64_t)1000000) + val.tv_usec; } static int64_t get_id_from_pool() { - if (ids_count == 0) { - return -1; - } - int32_t index = rand() % ids_count; - int64_t id = ids[index]; - ids[index] = ids[ids_count - 1]; - ids_count--; - return id; + if (ids_count == 0) { + return -1; + } + int32_t index = rand() % ids_count; + int64_t id = ids[index]; + ids[index] = ids[ids_count - 1]; + ids_count--; + return id; } -#define LMDB_CHECK(x) \ - do {\ - const int rc = (x);\ - if ( rc != MDB_SUCCESS ) {\ - printf("Error [%d] %s in %s at %s:%d\n", rc, mdb_strerror(rc), #x, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - }\ - } while(0) +#define LMDB_CHECK(x) \ + do { \ + const int rc = (x); \ + if (rc != MDB_SUCCESS) { \ + printf("Error [%d] %s in %s at %s:%d\n", rc, mdbx_strerror(rc), #x, \ + 
__FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) static void db_connect() { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; - LMDB_CHECK(mdb_env_create(&env)); - LMDB_CHECK(mdb_env_set_mapsize(env, 300000L * 4096L)); - LMDB_CHECK(mdb_env_set_maxdbs(env, 30)); + LMDB_CHECK(mdbx_env_create(&env)); + LMDB_CHECK(mdbx_env_set_mapsize(env, 300000L * 4096L)); + LMDB_CHECK(mdbx_env_set_maxdbs(env, 30)); #if defined(MDBX_LIFORECLAIM) - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); + LMDB_CHECK(mdbx_env_open( + env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); #else - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); + LMDB_CHECK(mdbx_env_open(env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); #endif - MDB_txn *txn; + MDB_txn *txn; - // transaction init - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - printf("Connection open\n"); + // transaction init + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + // open database in read-write mode + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + printf("Connection open\n"); } typedef struct 
{ - char session_id1[100]; - char session_id2[100]; - char ip[20]; - uint8_t fill[100]; + char session_id1[100]; + char session_id2[100]; + char ip[20]; + uint8_t fill[100]; } session_data_t; typedef struct { - int64_t obj_id; - int8_t event_type; + int64_t obj_id; + int8_t event_type; } __attribute__((__packed__)) event_data_t; static void create_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - session_data_t data; - // transaction init - snprintf(data.session_id1, sizeof (data.session_id1), "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", record_id % 3 + 1, record_id % 9 + 1, record_id); - snprintf(data.session_id2, sizeof (data.session_id2), "gx_service;%ld;%ld;node@spb-jsm1", record_id, record_id % 1000000000 + 99999); - snprintf(data.ip, sizeof (data.ip), "%d.%d.%d.%d", IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); - event.obj_id = record_id; - event.event_type = 1; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t event; + MDB_txn *txn; + session_data_t data; + // transaction init + snprintf(data.session_id1, sizeof(data.session_id1), + "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", + record_id % 3 + 1, record_id % 9 + 1, record_id); + snprintf(data.session_id2, sizeof(data.session_id2), + "gx_service;%ld;%ld;node@spb-jsm1", record_id, + record_id % 1000000000 + 99999); + snprintf(data.ip, sizeof(data.ip), "%d.%d.%d.%d", + IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); + event.obj_id = record_id; + event.event_type = 1; - MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; - MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; - MDB_val _ip_rec = {data.ip, strlen(data.ip)}; - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + (rand() % sizeof (data.fill))}; - MDB_val _event_rec = {&event, 
sizeof(event)}; + MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; + MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; + MDB_val _ip_rec = {data.ip, strlen(data.ip)}; + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + + (rand() % sizeof(data.fill))}; + MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - LMDB_CHECK(mdb_put(txn, dbi_session, &_obj_id_rec, &_data_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); - LMDB_CHECK(mdb_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); - lmdb_data_size += (_data_rec.mv_size + _obj_id_rec.mv_size * 4); - lmdb_key_size += (_obj_id_rec.mv_size + _session_id1_rec.mv_size + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + LMDB_CHECK(mdbx_put(txn, dbi_session, &_obj_id_rec, &_data_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id2_rec, 
&_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); + LMDB_CHECK(mdbx_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); + x_data_size += (_data_rec.mv_size + _obj_id_rec.mv_size * 4); + x_key_size += + (_obj_id_rec.mv_size + _session_id1_rec.mv_size + + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_add++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_add++; } static void delete_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t event; + MDB_txn *txn; - // transaction init - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // put data - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val _data_rec; - // get data - LMDB_CHECK(mdb_get(txn, dbi_session, &_obj_id_rec, &_data_rec)); - session_data_t* data = (session_data_t*) _data_rec.mv_data; + // transaction init + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + // open database in read-write mode + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + // put data + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val _data_rec; + // get data + LMDB_CHECK(mdbx_get(txn, dbi_session, &_obj_id_rec, 
&_data_rec)); + session_data_t *data = (session_data_t *)_data_rec.mv_data; - MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; - MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; - MDB_val _ip_rec = {data->ip, strlen(data->ip)}; - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id1_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id2_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_ip, &_ip_rec, NULL)); - event.obj_id = record_id; - event.event_type = 1; - MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdb_del(txn, dbi_event, &_event_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session, &_obj_id_rec, NULL)); + MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; + MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; + MDB_val _ip_rec = {data->ip, strlen(data->ip)}; + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id1_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id2_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_ip, &_ip_rec, NULL)); + event.obj_id = record_id; + event.event_type = 1; + MDB_val _event_rec = {&event, sizeof(event)}; + LMDB_CHECK(mdbx_del(txn, dbi_event, &_event_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session, &_obj_id_rec, NULL)); - lmdb_data_size -= (_data_rec.mv_size + _obj_id_rec.mv_size * 4); - lmdb_key_size -= (_obj_id_rec.mv_size + _session_id1_rec.mv_size + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); + x_data_size -= (_data_rec.mv_size + _obj_id_rec.mv_size * 4); + x_key_size -= + (_obj_id_rec.mv_size + _session_id1_rec.mv_size + + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_del++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_del++; } static void db_disconnect() { - mdb_env_close(env); - printf("Connection closed\n"); + mdbx_env_close(env); + printf("Connection closed\n"); } 
-static void get_db_stat(const char* db, int64_t* ms_branch_pages, int64_t* ms_leaf_pages) { - MDB_txn *txn; - MDB_stat stat; - MDB_dbi dbi; +static void get_db_stat(const char *db, int64_t *ms_branch_pages, + int64_t *ms_leaf_pages) { + MDB_txn *txn; + MDBX_stat stat; + MDB_dbi dbi; - LMDB_CHECK(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdb_stat(txn, dbi, &stat)); - mdb_txn_abort(txn); - printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", - db, - stat.ms_branch_pages, - stat.ms_depth, - stat.ms_entries, - stat.ms_leaf_pages, - stat.ms_overflow_pages); - (*ms_branch_pages) += stat.ms_branch_pages; - (*ms_leaf_pages) += stat.ms_leaf_pages; + LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); + LMDB_CHECK(mdbx_stat(txn, dbi, &stat, sizeof(stat))); + mdbx_txn_abort(txn); + printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, + stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, + stat.ms_leaf_pages, stat.ms_overflow_pages); + (*ms_branch_pages) += stat.ms_branch_pages; + (*ms_leaf_pages) += stat.ms_leaf_pages; } static void periodic_stat(void) { - int64_t ms_branch_pages = 0; - int64_t ms_leaf_pages = 0; - printf(" Name | ms_branch_pages | depth | entries | leaf_pages | overf_pages |\n"); - get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); - printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, "", "", ms_leaf_pages, ""); - static int64_t prev_add; - static int64_t prev_del; - static int64_t t = -1; - if (t > 0) { - int64_t delta = getTimeMicroseconds() - t; - printf("CPS: add %ld, delete %ld, items processed - %ldK data=%ldK key=%ldK\n", (lmdb_add - prev_add)*1000000 / delta, (lmdb_del - prev_del)*1000000 / delta, 
obj_id / 1024, lmdb_data_size / 1024, lmdb_key_size / 1024); - printf("usage data=%ld%%\n", ((lmdb_data_size + lmdb_key_size) * 100) / ((ms_leaf_pages + ms_branch_pages)*4096)); - } - t = getTimeMicroseconds(); - prev_add = lmdb_add; - prev_del = lmdb_del; + int64_t ms_branch_pages = 0; + int64_t ms_leaf_pages = 0; + printf(" Name | ms_branch_pages | depth | entries | " + "leaf_pages | overf_pages |\n"); + get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); + printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, + "", "", ms_leaf_pages, ""); + static int64_t prev_add; + static int64_t prev_del; + static int64_t t = -1; + if (t > 0) { + int64_t delta = getTimeMicroseconds() - t; + printf("CPS: add %ld, delete %ld, items processed - %ldK data=%ldK " + "key=%ldK\n", + (x_add - prev_add) * 1000000 / delta, + (x_del - prev_del) * 1000000 / delta, obj_id / 1024, + x_data_size / 1024, x_key_size / 1024); + printf("usage data=%ld%%\n", + ((x_data_size + x_key_size) * 100) / + ((ms_leaf_pages + ms_branch_pages) * 4096)); + } + t = getTimeMicroseconds(); + prev_add = x_add; + prev_del = x_del; } -//static void periodic_add_rec() { +// static void periodic_add_rec() { // for (int i = 0; i < 10240; i++) { // if (ids_count <= REC_COUNT) { // int64_t id = obj_id++; @@ -259,52 +284,52 @@ static void periodic_stat(void) { // periodic_stat(); //} -int main(int argc, char** argv) { - (void) argc; - (void) argv; +int main(int argc, char **argv) { + (void)argc; + (void)argv; - char filename[PATH_MAX]; - int i; - int64_t t; + char filename[PATH_MAX]; + int i; + int64_t t; - mkdir(opt_db_path, 0775); + mkdir(opt_db_path, 0775); - strcpy(filename, opt_db_path); - strcat(filename, "/data.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/data.mdb"); + 
remove(filename); - strcpy(filename, opt_db_path); - strcat(filename, "/lock.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/lock.mdb"); + remove(filename); - db_connect(); - periodic_stat(); - for (i = 0; i < 1024000; i++) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - } - periodic_stat(); - t = getTimeMicroseconds(); - while (1) { - int i; - int64_t now; - for (i = 0; i < 100; i++) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - id = get_id_from_pool(); - delete_record(id); - } - //int64_t id = obj_id++; - //create_record(id); - //add_id_to_pool(id); - now = getTimeMicroseconds(); - if ((now - t) > 100000) { - periodic_stat(); - t = now; - } - } - db_disconnect(); - return 0; + db_connect(); + periodic_stat(); + for (i = 0; i < 1024000; i++) { + int64_t id = obj_id++; + create_record(id); + add_id_to_pool(id); + } + periodic_stat(); + t = getTimeMicroseconds(); + while (1) { + int i; + int64_t now; + for (i = 0; i < 100; i++) { + int64_t id = obj_id++; + create_record(id); + add_id_to_pool(id); + id = get_id_from_pool(); + delete_record(id); + } + // int64_t id = obj_id++; + // create_record(id); + // add_id_to_pool(id); + now = getTimeMicroseconds(); + if ((now - t) > 100000) { + periodic_stat(); + t = now; + } + } + db_disconnect(); + return 0; }